{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 90473, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0011053021343384213, "grad_norm": 4.114652156829834, "learning_rate": 2.763957987838585e-07, "loss": 1.1961, "step": 100 }, { "epoch": 0.0022106042686768426, "grad_norm": 2.9325754642486572, "learning_rate": 5.52791597567717e-07, "loss": 1.0048, "step": 200 }, { "epoch": 0.0033159064030152644, "grad_norm": 3.6297736167907715, "learning_rate": 8.291873963515755e-07, "loss": 0.8568, "step": 300 }, { "epoch": 0.004421208537353685, "grad_norm": 2.905796766281128, "learning_rate": 1.105583195135434e-06, "loss": 0.8767, "step": 400 }, { "epoch": 0.005526510671692107, "grad_norm": 4.603543758392334, "learning_rate": 1.3819789939192927e-06, "loss": 0.7411, "step": 500 }, { "epoch": 0.006631812806030529, "grad_norm": 3.2700424194335938, "learning_rate": 1.658374792703151e-06, "loss": 0.7773, "step": 600 }, { "epoch": 0.00773711494036895, "grad_norm": 3.0455334186553955, "learning_rate": 1.9347705914870095e-06, "loss": 0.7291, "step": 700 }, { "epoch": 0.00884241707470737, "grad_norm": 3.2247352600097656, "learning_rate": 2.211166390270868e-06, "loss": 0.6383, "step": 800 }, { "epoch": 0.009947719209045794, "grad_norm": 3.087158441543579, "learning_rate": 2.4875621890547264e-06, "loss": 0.6445, "step": 900 }, { "epoch": 0.011053021343384215, "grad_norm": 1.812444806098938, "learning_rate": 2.7639579878385854e-06, "loss": 0.631, "step": 1000 }, { "epoch": 0.012158323477722636, "grad_norm": 3.248868465423584, "learning_rate": 3.0403537866224434e-06, "loss": 0.6189, "step": 1100 }, { "epoch": 0.013263625612061057, "grad_norm": 1.8088688850402832, "learning_rate": 3.316749585406302e-06, "loss": 0.5809, "step": 1200 }, { "epoch": 0.014368927746399479, "grad_norm": 1.9592525959014893, "learning_rate": 3.5931453841901604e-06, "loss": 0.5885, "step": 1300 }, { "epoch": 0.0154742298807379, "grad_norm": 2.4960570335388184, "learning_rate": 3.869541182974019e-06, "loss": 0.5743, "step": 1400 }, { "epoch": 0.01657953201507632, "grad_norm": 1.895150899887085, "learning_rate": 4.145936981757877e-06, "loss": 0.5897, "step": 1500 }, { "epoch": 0.01768483414941474, "grad_norm": 2.611772060394287, "learning_rate": 4.422332780541736e-06, "loss": 0.5213, "step": 1600 }, { "epoch": 0.018790136283753162, "grad_norm": 3.9915366172790527, "learning_rate": 4.698728579325595e-06, "loss": 0.5928, "step": 1700 }, { "epoch": 0.019895438418091587, "grad_norm": 2.026409387588501, "learning_rate": 4.975124378109453e-06, "loss": 0.5726, "step": 1800 }, { "epoch": 0.02100074055243001, "grad_norm": 1.9394502639770508, "learning_rate": 4.999987004364365e-06, "loss": 0.5458, "step": 1900 }, { "epoch": 0.02210604268676843, "grad_norm": 2.1378285884857178, "learning_rate": 4.999942749379922e-06, "loss": 0.5452, "step": 2000 }, { "epoch": 0.02321134482110685, "grad_norm": 2.2720561027526855, "learning_rate": 4.999867108486303e-06, "loss": 0.5195, "step": 2100 }, { "epoch": 0.024316646955445272, "grad_norm": 2.4831795692443848, "learning_rate": 4.99976008263315e-06, "loss": 0.5431, "step": 2200 }, { "epoch": 0.025421949089783694, "grad_norm": 3.52687668800354, "learning_rate": 4.999621673164139e-06, "loss": 0.5703, "step": 2300 }, { "epoch": 0.026527251224122115, "grad_norm": 2.1417176723480225, "learning_rate": 4.999451881816949e-06, "loss": 0.5549, "step": 2400 }, { "epoch": 0.027632553358460536, "grad_norm": 2.2087039947509766, "learning_rate": 4.999250710723255e-06, "loss": 0.5664, "step": 2500 }, { "epoch": 0.028737855492798958, "grad_norm": 2.0288796424865723, "learning_rate": 4.999018162408687e-06, "loss": 0.5864, "step": 2600 }, { "epoch": 0.02984315762713738, "grad_norm": 1.9152870178222656, "learning_rate": 4.998754239792809e-06, "loss": 0.5568, "step": 2700 }, { "epoch": 0.0309484597614758, "grad_norm": 1.9485653638839722, "learning_rate": 4.998458946189078e-06, "loss": 0.5706, "step": 2800 }, { "epoch": 0.03205376189581422, "grad_norm": 2.10481595993042, "learning_rate": 4.9981322853048e-06, "loss": 0.5501, "step": 2900 }, { "epoch": 0.03315906403015264, "grad_norm": 1.8621227741241455, "learning_rate": 4.9977742612410905e-06, "loss": 0.5394, "step": 3000 }, { "epoch": 0.034264366164491064, "grad_norm": 2.057615280151367, "learning_rate": 4.997384878492817e-06, "loss": 0.5078, "step": 3100 }, { "epoch": 0.03536966829882948, "grad_norm": 1.742665410041809, "learning_rate": 4.996964141948542e-06, "loss": 0.5584, "step": 3200 }, { "epoch": 0.03647497043316791, "grad_norm": 2.150362253189087, "learning_rate": 4.996512056890468e-06, "loss": 0.5264, "step": 3300 }, { "epoch": 0.037580272567506325, "grad_norm": 2.3525052070617676, "learning_rate": 4.996028628994365e-06, "loss": 0.5828, "step": 3400 }, { "epoch": 0.03868557470184475, "grad_norm": 1.6484140157699585, "learning_rate": 4.9955138643295e-06, "loss": 0.52, "step": 3500 }, { "epoch": 0.039790876836183174, "grad_norm": 3.176095724105835, "learning_rate": 4.994967769358565e-06, "loss": 0.557, "step": 3600 }, { "epoch": 0.04089617897052159, "grad_norm": 1.66346275806427, "learning_rate": 4.9943903509375926e-06, "loss": 0.5121, "step": 3700 }, { "epoch": 0.04200148110486002, "grad_norm": 2.594338893890381, "learning_rate": 4.9937816163158685e-06, "loss": 0.4962, "step": 3800 }, { "epoch": 0.043106783239198435, "grad_norm": 2.330629348754883, "learning_rate": 4.993141573135843e-06, "loss": 0.5217, "step": 3900 }, { "epoch": 0.04421208537353686, "grad_norm": 2.264955759048462, "learning_rate": 4.9924702294330375e-06, "loss": 0.5157, "step": 4000 }, { "epoch": 0.04531738750787528, "grad_norm": 1.9724615812301636, "learning_rate": 4.991767593635935e-06, "loss": 0.5294, "step": 4100 }, { "epoch": 0.0464226896422137, "grad_norm": 1.9894862174987793, "learning_rate": 4.991033674565885e-06, "loss": 0.5556, "step": 4200 }, { "epoch": 0.04752799177655212, "grad_norm": 1.9730507135391235, "learning_rate": 4.990268481436984e-06, "loss": 0.4888, "step": 4300 }, { "epoch": 0.048633293910890545, "grad_norm": 2.208463430404663, "learning_rate": 4.989472023855966e-06, "loss": 0.5387, "step": 4400 }, { "epoch": 0.04973859604522896, "grad_norm": 2.394077777862549, "learning_rate": 4.988644311822076e-06, "loss": 0.4932, "step": 4500 }, { "epoch": 0.05084389817956739, "grad_norm": 2.514061689376831, "learning_rate": 4.987785355726953e-06, "loss": 0.5254, "step": 4600 }, { "epoch": 0.051949200313905805, "grad_norm": 1.8961576223373413, "learning_rate": 4.9868951663544885e-06, "loss": 0.5145, "step": 4700 }, { "epoch": 0.05305450244824423, "grad_norm": 2.2813808917999268, "learning_rate": 4.9859737548807005e-06, "loss": 0.4982, "step": 4800 }, { "epoch": 0.05415980458258265, "grad_norm": 2.1236634254455566, "learning_rate": 4.98502113287359e-06, "loss": 0.5206, "step": 4900 }, { "epoch": 0.05526510671692107, "grad_norm": 2.573836326599121, "learning_rate": 4.984037312292992e-06, "loss": 0.4844, "step": 5000 }, { "epoch": 0.05637040885125949, "grad_norm": 1.2394871711730957, "learning_rate": 4.983022305490431e-06, "loss": 0.4921, "step": 5100 }, { "epoch": 0.057475710985597915, "grad_norm": 2.2655134201049805, "learning_rate": 4.9819761252089635e-06, "loss": 0.5278, "step": 5200 }, { "epoch": 0.05858101311993633, "grad_norm": 1.9459484815597534, "learning_rate": 4.980898784583019e-06, "loss": 0.5215, "step": 5300 }, { "epoch": 0.05968631525427476, "grad_norm": 2.574147939682007, "learning_rate": 4.979790297138232e-06, "loss": 0.5155, "step": 5400 }, { "epoch": 0.060791617388613176, "grad_norm": 2.5039682388305664, "learning_rate": 4.9786506767912775e-06, "loss": 0.5245, "step": 5500 }, { "epoch": 0.0618969195229516, "grad_norm": 2.6227054595947266, "learning_rate": 4.977479937849689e-06, "loss": 0.4843, "step": 5600 }, { "epoch": 0.06300222165729003, "grad_norm": 2.1595468521118164, "learning_rate": 4.9762780950116865e-06, "loss": 0.4863, "step": 5700 }, { "epoch": 0.06410752379162844, "grad_norm": 1.8619611263275146, "learning_rate": 4.975045163365989e-06, "loss": 0.5083, "step": 5800 }, { "epoch": 0.06521282592596686, "grad_norm": 2.270404100418091, "learning_rate": 4.973781158391621e-06, "loss": 0.5516, "step": 5900 }, { "epoch": 0.06631812806030528, "grad_norm": 1.9068191051483154, "learning_rate": 4.972486095957725e-06, "loss": 0.5058, "step": 6000 }, { "epoch": 0.06742343019464371, "grad_norm": 2.2948782444000244, "learning_rate": 4.971159992323359e-06, "loss": 0.5018, "step": 6100 }, { "epoch": 0.06852873232898213, "grad_norm": 3.0896589756011963, "learning_rate": 4.969802864137289e-06, "loss": 0.5062, "step": 6200 }, { "epoch": 0.06963403446332055, "grad_norm": 1.7098015546798706, "learning_rate": 4.96841472843779e-06, "loss": 0.5067, "step": 6300 }, { "epoch": 0.07073933659765896, "grad_norm": 2.6850175857543945, "learning_rate": 4.966995602652417e-06, "loss": 0.5287, "step": 6400 }, { "epoch": 0.0718446387319974, "grad_norm": 1.6628856658935547, "learning_rate": 4.965545504597802e-06, "loss": 0.5225, "step": 6500 }, { "epoch": 0.07294994086633581, "grad_norm": 2.279022693634033, "learning_rate": 4.9640644524794205e-06, "loss": 0.5026, "step": 6600 }, { "epoch": 0.07405524300067423, "grad_norm": 0.924898624420166, "learning_rate": 4.962552464891363e-06, "loss": 0.5354, "step": 6700 }, { "epoch": 0.07516054513501265, "grad_norm": 2.779557228088379, "learning_rate": 4.961009560816109e-06, "loss": 0.4776, "step": 6800 }, { "epoch": 0.07626584726935108, "grad_norm": 2.554727077484131, "learning_rate": 4.9594357596242795e-06, "loss": 0.4821, "step": 6900 }, { "epoch": 0.0773711494036895, "grad_norm": 1.730661153793335, "learning_rate": 4.957831081074398e-06, "loss": 0.4903, "step": 7000 }, { "epoch": 0.07847645153802792, "grad_norm": 2.198575735092163, "learning_rate": 4.956195545312647e-06, "loss": 0.4946, "step": 7100 }, { "epoch": 0.07958175367236635, "grad_norm": 1.3369964361190796, "learning_rate": 4.954529172872605e-06, "loss": 0.51, "step": 7200 }, { "epoch": 0.08068705580670477, "grad_norm": 2.4426262378692627, "learning_rate": 4.952831984674998e-06, "loss": 0.5108, "step": 7300 }, { "epoch": 0.08179235794104318, "grad_norm": 3.9186463356018066, "learning_rate": 4.951104002027432e-06, "loss": 0.5086, "step": 7400 }, { "epoch": 0.0828976600753816, "grad_norm": 1.9639850854873657, "learning_rate": 4.9493452466241254e-06, "loss": 0.4758, "step": 7500 }, { "epoch": 0.08400296220972003, "grad_norm": 1.2126818895339966, "learning_rate": 4.94755574054564e-06, "loss": 0.5017, "step": 7600 }, { "epoch": 0.08510826434405845, "grad_norm": 2.206359386444092, "learning_rate": 4.945735506258598e-06, "loss": 0.537, "step": 7700 }, { "epoch": 0.08621356647839687, "grad_norm": 1.7051986455917358, "learning_rate": 4.943884566615409e-06, "loss": 0.4835, "step": 7800 }, { "epoch": 0.08731886861273529, "grad_norm": 1.832702398300171, "learning_rate": 4.942002944853973e-06, "loss": 0.454, "step": 7900 }, { "epoch": 0.08842417074707372, "grad_norm": 1.8357278108596802, "learning_rate": 4.940090664597394e-06, "loss": 0.4972, "step": 8000 }, { "epoch": 0.08952947288141214, "grad_norm": 2.1181540489196777, "learning_rate": 4.938147749853685e-06, "loss": 0.5184, "step": 8100 }, { "epoch": 0.09063477501575055, "grad_norm": 1.7029916048049927, "learning_rate": 4.936174225015463e-06, "loss": 0.5324, "step": 8200 }, { "epoch": 0.09174007715008897, "grad_norm": 2.0932748317718506, "learning_rate": 4.934170114859643e-06, "loss": 0.4806, "step": 8300 }, { "epoch": 0.0928453792844274, "grad_norm": 2.3745322227478027, "learning_rate": 4.932135444547129e-06, "loss": 0.4869, "step": 8400 }, { "epoch": 0.09395068141876582, "grad_norm": 2.1215474605560303, "learning_rate": 4.930070239622498e-06, "loss": 0.4777, "step": 8500 }, { "epoch": 0.09505598355310424, "grad_norm": 1.7763068675994873, "learning_rate": 4.9279745260136756e-06, "loss": 0.478, "step": 8600 }, { "epoch": 0.09616128568744266, "grad_norm": 1.950086236000061, "learning_rate": 4.925848330031617e-06, "loss": 0.5048, "step": 8700 }, { "epoch": 0.09726658782178109, "grad_norm": 2.959291696548462, "learning_rate": 4.923691678369971e-06, "loss": 0.513, "step": 8800 }, { "epoch": 0.09837188995611951, "grad_norm": 2.3258442878723145, "learning_rate": 4.921504598104745e-06, "loss": 0.4896, "step": 8900 }, { "epoch": 0.09947719209045792, "grad_norm": 2.5175669193267822, "learning_rate": 4.9192871166939715e-06, "loss": 0.4783, "step": 9000 }, { "epoch": 0.10058249422479634, "grad_norm": 1.981148600578308, "learning_rate": 4.917039261977353e-06, "loss": 0.4906, "step": 9100 }, { "epoch": 0.10168779635913477, "grad_norm": 2.439974069595337, "learning_rate": 4.914761062175925e-06, "loss": 0.5007, "step": 9200 }, { "epoch": 0.10279309849347319, "grad_norm": 2.8156814575195312, "learning_rate": 4.912452545891689e-06, "loss": 0.5203, "step": 9300 }, { "epoch": 0.10389840062781161, "grad_norm": 2.4708168506622314, "learning_rate": 4.9101137421072605e-06, "loss": 0.4663, "step": 9400 }, { "epoch": 0.10500370276215003, "grad_norm": 2.4594314098358154, "learning_rate": 4.907744680185508e-06, "loss": 0.5027, "step": 9500 }, { "epoch": 0.10610900489648846, "grad_norm": 1.7548918724060059, "learning_rate": 4.905345389869176e-06, "loss": 0.4534, "step": 9600 }, { "epoch": 0.10721430703082688, "grad_norm": 1.6353791952133179, "learning_rate": 4.902915901280517e-06, "loss": 0.49, "step": 9700 }, { "epoch": 0.1083196091651653, "grad_norm": 3.52217698097229, "learning_rate": 4.9004562449209146e-06, "loss": 0.4935, "step": 9800 }, { "epoch": 0.10942491129950371, "grad_norm": 1.6542017459869385, "learning_rate": 4.897966451670495e-06, "loss": 0.5118, "step": 9900 }, { "epoch": 0.11053021343384214, "grad_norm": 2.575944185256958, "learning_rate": 4.895446552787744e-06, "loss": 0.4977, "step": 10000 }, { "epoch": 0.11163551556818056, "grad_norm": 2.081350088119507, "learning_rate": 4.8928965799091134e-06, "loss": 0.5261, "step": 10100 }, { "epoch": 0.11274081770251898, "grad_norm": 2.022676944732666, "learning_rate": 4.890316565048624e-06, "loss": 0.4889, "step": 10200 }, { "epoch": 0.1138461198368574, "grad_norm": 1.5808357000350952, "learning_rate": 4.887706540597461e-06, "loss": 0.4929, "step": 10300 }, { "epoch": 0.11495142197119583, "grad_norm": 2.1185178756713867, "learning_rate": 4.8850665393235716e-06, "loss": 0.4575, "step": 10400 }, { "epoch": 0.11605672410553425, "grad_norm": 2.5382957458496094, "learning_rate": 4.8823965943712505e-06, "loss": 0.4979, "step": 10500 }, { "epoch": 0.11716202623987267, "grad_norm": 2.045133590698242, "learning_rate": 4.879696739260726e-06, "loss": 0.5215, "step": 10600 }, { "epoch": 0.11826732837421108, "grad_norm": 2.119107484817505, "learning_rate": 4.876967007887737e-06, "loss": 0.4754, "step": 10700 }, { "epoch": 0.11937263050854952, "grad_norm": 2.549633502960205, "learning_rate": 4.8742074345231076e-06, "loss": 0.5051, "step": 10800 }, { "epoch": 0.12047793264288793, "grad_norm": 3.1271703243255615, "learning_rate": 4.8714180538123205e-06, "loss": 0.5036, "step": 10900 }, { "epoch": 0.12158323477722635, "grad_norm": 1.8725048303604126, "learning_rate": 4.868598900775076e-06, "loss": 0.4766, "step": 11000 }, { "epoch": 0.12268853691156478, "grad_norm": 1.3768223524093628, "learning_rate": 4.865750010804857e-06, "loss": 0.4821, "step": 11100 }, { "epoch": 0.1237938390459032, "grad_norm": 2.7702245712280273, "learning_rate": 4.8628714196684854e-06, "loss": 0.5154, "step": 11200 }, { "epoch": 0.12489914118024162, "grad_norm": 2.6272552013397217, "learning_rate": 4.859963163505668e-06, "loss": 0.4747, "step": 11300 }, { "epoch": 0.12600444331458005, "grad_norm": 1.649949312210083, "learning_rate": 4.857025278828545e-06, "loss": 0.4836, "step": 11400 }, { "epoch": 0.12710974544891845, "grad_norm": 2.358071804046631, "learning_rate": 4.854057802521234e-06, "loss": 0.5184, "step": 11500 }, { "epoch": 0.12821504758325689, "grad_norm": 2.5856614112854004, "learning_rate": 4.851060771839367e-06, "loss": 0.4818, "step": 11600 }, { "epoch": 0.12932034971759532, "grad_norm": 1.8580783605575562, "learning_rate": 4.848034224409616e-06, "loss": 0.4887, "step": 11700 }, { "epoch": 0.13042565185193372, "grad_norm": 2.2157649993896484, "learning_rate": 4.84497819822923e-06, "loss": 0.5045, "step": 11800 }, { "epoch": 0.13153095398627215, "grad_norm": 1.4233261346817017, "learning_rate": 4.841892731665552e-06, "loss": 0.5147, "step": 11900 }, { "epoch": 0.13263625612061056, "grad_norm": 1.6375737190246582, "learning_rate": 4.838777863455537e-06, "loss": 0.4651, "step": 12000 }, { "epoch": 0.133741558254949, "grad_norm": 1.2430723905563354, "learning_rate": 4.835633632705269e-06, "loss": 0.4737, "step": 12100 }, { "epoch": 0.13484686038928742, "grad_norm": 2.4360849857330322, "learning_rate": 4.83246007888947e-06, "loss": 0.4936, "step": 12200 }, { "epoch": 0.13595216252362582, "grad_norm": 1.9232250452041626, "learning_rate": 4.8292572418509995e-06, "loss": 0.4763, "step": 12300 }, { "epoch": 0.13705746465796426, "grad_norm": 2.343539237976074, "learning_rate": 4.82602516180036e-06, "loss": 0.4956, "step": 12400 }, { "epoch": 0.1381627667923027, "grad_norm": 1.493943691253662, "learning_rate": 4.8227638793151875e-06, "loss": 0.4653, "step": 12500 }, { "epoch": 0.1392680689266411, "grad_norm": 3.257138729095459, "learning_rate": 4.819473435339748e-06, "loss": 0.4564, "step": 12600 }, { "epoch": 0.14037337106097952, "grad_norm": 1.8864688873291016, "learning_rate": 4.816153871184418e-06, "loss": 0.4667, "step": 12700 }, { "epoch": 0.14147867319531793, "grad_norm": 2.1740174293518066, "learning_rate": 4.812805228525166e-06, "loss": 0.4499, "step": 12800 }, { "epoch": 0.14258397532965636, "grad_norm": 1.5121800899505615, "learning_rate": 4.809427549403033e-06, "loss": 0.4933, "step": 12900 }, { "epoch": 0.1436892774639948, "grad_norm": 1.604945182800293, "learning_rate": 4.8060208762236025e-06, "loss": 0.479, "step": 13000 }, { "epoch": 0.1447945795983332, "grad_norm": 1.933350682258606, "learning_rate": 4.802585251756468e-06, "loss": 0.5105, "step": 13100 }, { "epoch": 0.14589988173267163, "grad_norm": 2.8999829292297363, "learning_rate": 4.799120719134696e-06, "loss": 0.4689, "step": 13200 }, { "epoch": 0.14700518386701006, "grad_norm": 2.4011030197143555, "learning_rate": 4.795627321854283e-06, "loss": 0.4709, "step": 13300 }, { "epoch": 0.14811048600134846, "grad_norm": 2.080972671508789, "learning_rate": 4.792105103773618e-06, "loss": 0.4893, "step": 13400 }, { "epoch": 0.1492157881356869, "grad_norm": 2.4878017902374268, "learning_rate": 4.788554109112918e-06, "loss": 0.5236, "step": 13500 }, { "epoch": 0.1503210902700253, "grad_norm": 2.1215240955352783, "learning_rate": 4.78497438245368e-06, "loss": 0.4817, "step": 13600 }, { "epoch": 0.15142639240436373, "grad_norm": 1.5228586196899414, "learning_rate": 4.781365968738126e-06, "loss": 0.4895, "step": 13700 }, { "epoch": 0.15253169453870216, "grad_norm": 2.399446487426758, "learning_rate": 4.777728913268632e-06, "loss": 0.4731, "step": 13800 }, { "epoch": 0.15363699667304057, "grad_norm": 2.1382806301116943, "learning_rate": 4.774063261707158e-06, "loss": 0.4981, "step": 13900 }, { "epoch": 0.154742298807379, "grad_norm": 1.590667486190796, "learning_rate": 4.770369060074685e-06, "loss": 0.4599, "step": 14000 }, { "epoch": 0.15584760094171743, "grad_norm": 1.882934331893921, "learning_rate": 4.766646354750621e-06, "loss": 0.5039, "step": 14100 }, { "epoch": 0.15695290307605583, "grad_norm": 1.8898316621780396, "learning_rate": 4.762895192472235e-06, "loss": 0.4758, "step": 14200 }, { "epoch": 0.15805820521039426, "grad_norm": 1.6479010581970215, "learning_rate": 4.759115620334062e-06, "loss": 0.493, "step": 14300 }, { "epoch": 0.1591635073447327, "grad_norm": 2.28085994720459, "learning_rate": 4.755307685787312e-06, "loss": 0.5221, "step": 14400 }, { "epoch": 0.1602688094790711, "grad_norm": 2.697305202484131, "learning_rate": 4.751471436639271e-06, "loss": 0.5172, "step": 14500 }, { "epoch": 0.16137411161340953, "grad_norm": 1.897016167640686, "learning_rate": 4.7476069210527135e-06, "loss": 0.5284, "step": 14600 }, { "epoch": 0.16247941374774794, "grad_norm": 2.659196376800537, "learning_rate": 4.743714187545282e-06, "loss": 0.4776, "step": 14700 }, { "epoch": 0.16358471588208637, "grad_norm": 1.7990115880966187, "learning_rate": 4.739793284988889e-06, "loss": 0.4506, "step": 14800 }, { "epoch": 0.1646900180164248, "grad_norm": 2.136432409286499, "learning_rate": 4.735844262609096e-06, "loss": 0.4775, "step": 14900 }, { "epoch": 0.1657953201507632, "grad_norm": 1.8059773445129395, "learning_rate": 4.731867169984506e-06, "loss": 0.4847, "step": 15000 }, { "epoch": 0.16690062228510164, "grad_norm": 1.7475543022155762, "learning_rate": 4.727862057046125e-06, "loss": 0.5092, "step": 15100 }, { "epoch": 0.16800592441944007, "grad_norm": 1.7633237838745117, "learning_rate": 4.723828974076752e-06, "loss": 0.4776, "step": 15200 }, { "epoch": 0.16911122655377847, "grad_norm": 1.973683476448059, "learning_rate": 4.719767971710335e-06, "loss": 0.4866, "step": 15300 }, { "epoch": 0.1702165286881169, "grad_norm": 2.3195412158966064, "learning_rate": 4.715679100931343e-06, "loss": 0.4784, "step": 15400 }, { "epoch": 0.1713218308224553, "grad_norm": 2.262366533279419, "learning_rate": 4.711562413074122e-06, "loss": 0.4494, "step": 15500 }, { "epoch": 0.17242713295679374, "grad_norm": 2.2675039768218994, "learning_rate": 4.707417959822252e-06, "loss": 0.5182, "step": 15600 }, { "epoch": 0.17353243509113217, "grad_norm": 2.6644225120544434, "learning_rate": 4.703245793207898e-06, "loss": 0.4819, "step": 15700 }, { "epoch": 0.17463773722547057, "grad_norm": 1.4928964376449585, "learning_rate": 4.699045965611157e-06, "loss": 0.4542, "step": 15800 }, { "epoch": 0.175743039359809, "grad_norm": 1.7893882989883423, "learning_rate": 4.694818529759399e-06, "loss": 0.4836, "step": 15900 }, { "epoch": 0.17684834149414744, "grad_norm": 1.5968459844589233, "learning_rate": 4.690563538726606e-06, "loss": 0.4702, "step": 16000 }, { "epoch": 0.17795364362848584, "grad_norm": 2.2333779335021973, "learning_rate": 4.686281045932707e-06, "loss": 0.4912, "step": 16100 }, { "epoch": 0.17905894576282427, "grad_norm": 1.1746132373809814, "learning_rate": 4.681971105142905e-06, "loss": 0.4935, "step": 16200 }, { "epoch": 0.18016424789716268, "grad_norm": 1.5028539896011353, "learning_rate": 4.677633770467003e-06, "loss": 0.4908, "step": 16300 }, { "epoch": 0.1812695500315011, "grad_norm": 1.9890942573547363, "learning_rate": 4.6732690963587256e-06, "loss": 0.4651, "step": 16400 }, { "epoch": 0.18237485216583954, "grad_norm": 2.262347459793091, "learning_rate": 4.668877137615032e-06, "loss": 0.496, "step": 16500 }, { "epoch": 0.18348015430017794, "grad_norm": 2.2725613117218018, "learning_rate": 4.664457949375434e-06, "loss": 0.4707, "step": 16600 }, { "epoch": 0.18458545643451638, "grad_norm": 2.965789794921875, "learning_rate": 4.660011587121297e-06, "loss": 0.4969, "step": 16700 }, { "epoch": 0.1856907585688548, "grad_norm": 1.5919311046600342, "learning_rate": 4.655538106675149e-06, "loss": 0.4985, "step": 16800 }, { "epoch": 0.1867960607031932, "grad_norm": 2.4821956157684326, "learning_rate": 4.651037564199977e-06, "loss": 0.4878, "step": 16900 }, { "epoch": 0.18790136283753164, "grad_norm": 1.9851549863815308, "learning_rate": 4.646510016198521e-06, "loss": 0.4778, "step": 17000 }, { "epoch": 0.18900666497187005, "grad_norm": 1.9277724027633667, "learning_rate": 4.641955519512567e-06, "loss": 0.5302, "step": 17100 }, { "epoch": 0.19011196710620848, "grad_norm": 2.289950132369995, "learning_rate": 4.637374131322232e-06, "loss": 0.4646, "step": 17200 }, { "epoch": 0.1912172692405469, "grad_norm": 2.9119439125061035, "learning_rate": 4.632765909145247e-06, "loss": 0.5033, "step": 17300 }, { "epoch": 0.19232257137488531, "grad_norm": 1.9241691827774048, "learning_rate": 4.628130910836234e-06, "loss": 0.4879, "step": 17400 }, { "epoch": 0.19342787350922375, "grad_norm": 1.1978574991226196, "learning_rate": 4.623469194585979e-06, "loss": 0.4675, "step": 17500 }, { "epoch": 0.19453317564356218, "grad_norm": 1.6705842018127441, "learning_rate": 4.618780818920705e-06, "loss": 0.4605, "step": 17600 }, { "epoch": 0.19563847777790058, "grad_norm": 2.020331859588623, "learning_rate": 4.614065842701332e-06, "loss": 0.4974, "step": 17700 }, { "epoch": 0.19674377991223901, "grad_norm": 2.0887222290039062, "learning_rate": 4.609324325122743e-06, "loss": 0.4736, "step": 17800 }, { "epoch": 0.19784908204657745, "grad_norm": 2.283088445663452, "learning_rate": 4.604556325713035e-06, "loss": 0.4985, "step": 17900 }, { "epoch": 0.19895438418091585, "grad_norm": 2.186509132385254, "learning_rate": 4.599761904332778e-06, "loss": 0.4767, "step": 18000 }, { "epoch": 0.20005968631525428, "grad_norm": 2.262012243270874, "learning_rate": 4.594941121174262e-06, "loss": 0.4697, "step": 18100 }, { "epoch": 0.20116498844959269, "grad_norm": 1.634402871131897, "learning_rate": 4.590094036760736e-06, "loss": 0.4939, "step": 18200 }, { "epoch": 0.20227029058393112, "grad_norm": 1.883914589881897, "learning_rate": 4.5852207119456555e-06, "loss": 0.47, "step": 18300 }, { "epoch": 0.20337559271826955, "grad_norm": 2.231407880783081, "learning_rate": 4.580321207911912e-06, "loss": 0.4815, "step": 18400 }, { "epoch": 0.20448089485260795, "grad_norm": 2.605910539627075, "learning_rate": 4.57539558617107e-06, "loss": 0.5328, "step": 18500 }, { "epoch": 0.20558619698694638, "grad_norm": 1.1122691631317139, "learning_rate": 4.570443908562593e-06, "loss": 0.4606, "step": 18600 }, { "epoch": 0.20669149912128482, "grad_norm": 1.9738783836364746, "learning_rate": 4.565466237253066e-06, "loss": 0.4612, "step": 18700 }, { "epoch": 0.20779680125562322, "grad_norm": 3.1255314350128174, "learning_rate": 4.560462634735416e-06, "loss": 0.469, "step": 18800 }, { "epoch": 0.20890210338996165, "grad_norm": 2.3683340549468994, "learning_rate": 4.555433163828126e-06, "loss": 0.4997, "step": 18900 }, { "epoch": 0.21000740552430006, "grad_norm": 2.482985496520996, "learning_rate": 4.55037788767445e-06, "loss": 0.5105, "step": 19000 }, { "epoch": 0.2111127076586385, "grad_norm": 1.7868962287902832, "learning_rate": 4.545296869741616e-06, "loss": 0.4899, "step": 19100 }, { "epoch": 0.21221800979297692, "grad_norm": 1.6937700510025024, "learning_rate": 4.540190173820033e-06, "loss": 0.5029, "step": 19200 }, { "epoch": 0.21332331192731532, "grad_norm": 1.6983795166015625, "learning_rate": 4.535057864022486e-06, "loss": 0.5273, "step": 19300 }, { "epoch": 0.21442861406165376, "grad_norm": 1.446453332901001, "learning_rate": 4.529900004783334e-06, "loss": 0.4864, "step": 19400 }, { "epoch": 0.2155339161959922, "grad_norm": 2.247065305709839, "learning_rate": 4.524716660857701e-06, "loss": 0.4805, "step": 19500 }, { "epoch": 0.2166392183303306, "grad_norm": 1.6583445072174072, "learning_rate": 4.519507897320662e-06, "loss": 0.4631, "step": 19600 }, { "epoch": 0.21774452046466902, "grad_norm": 1.718631625175476, "learning_rate": 4.514273779566426e-06, "loss": 0.4893, "step": 19700 }, { "epoch": 0.21884982259900743, "grad_norm": 1.6608977317810059, "learning_rate": 4.509014373307515e-06, "loss": 0.483, "step": 19800 }, { "epoch": 0.21995512473334586, "grad_norm": 2.0695135593414307, "learning_rate": 4.503729744573943e-06, "loss": 0.5042, "step": 19900 }, { "epoch": 0.2210604268676843, "grad_norm": 1.75504469871521, "learning_rate": 4.498419959712376e-06, "loss": 0.4844, "step": 20000 }, { "epoch": 0.2221657290020227, "grad_norm": 3.0820794105529785, "learning_rate": 4.493085085385314e-06, "loss": 0.4775, "step": 20100 }, { "epoch": 0.22327103113636113, "grad_norm": 2.3822927474975586, "learning_rate": 4.487725188570241e-06, "loss": 0.4563, "step": 20200 }, { "epoch": 0.22437633327069956, "grad_norm": 2.8337135314941406, "learning_rate": 4.482340336558793e-06, "loss": 0.4712, "step": 20300 }, { "epoch": 0.22548163540503796, "grad_norm": 2.8210105895996094, "learning_rate": 4.476930596955909e-06, "loss": 0.5026, "step": 20400 }, { "epoch": 0.2265869375393764, "grad_norm": 2.012446165084839, "learning_rate": 4.471496037678982e-06, "loss": 0.4728, "step": 20500 }, { "epoch": 0.2276922396737148, "grad_norm": 2.477320432662964, "learning_rate": 4.466036726957008e-06, "loss": 0.5243, "step": 20600 }, { "epoch": 0.22879754180805323, "grad_norm": 2.1189372539520264, "learning_rate": 4.460552733329729e-06, "loss": 0.4414, "step": 20700 }, { "epoch": 0.22990284394239166, "grad_norm": 1.6811827421188354, "learning_rate": 4.455044125646773e-06, "loss": 0.4606, "step": 20800 }, { "epoch": 0.23100814607673006, "grad_norm": 1.8918300867080688, "learning_rate": 4.449510973066785e-06, "loss": 0.4587, "step": 20900 }, { "epoch": 0.2321134482110685, "grad_norm": 1.6469461917877197, "learning_rate": 4.44395334505657e-06, "loss": 0.4811, "step": 21000 }, { "epoch": 0.23321875034540693, "grad_norm": 1.0091384649276733, "learning_rate": 4.438371311390205e-06, "loss": 0.4469, "step": 21100 }, { "epoch": 0.23432405247974533, "grad_norm": 1.67509126663208, "learning_rate": 4.432764942148177e-06, "loss": 0.4812, "step": 21200 }, { "epoch": 0.23542935461408376, "grad_norm": 2.054719924926758, "learning_rate": 4.427134307716496e-06, "loss": 0.4343, "step": 21300 }, { "epoch": 0.23653465674842217, "grad_norm": 2.0753352642059326, "learning_rate": 4.421479478785814e-06, "loss": 0.4677, "step": 21400 }, { "epoch": 0.2376399588827606, "grad_norm": 1.5594350099563599, "learning_rate": 4.415800526350535e-06, "loss": 0.475, "step": 21500 }, { "epoch": 0.23874526101709903, "grad_norm": 2.458397626876831, "learning_rate": 4.410097521707926e-06, "loss": 0.4943, "step": 21600 }, { "epoch": 0.23985056315143743, "grad_norm": 2.180816888809204, "learning_rate": 4.404370536457221e-06, "loss": 0.4361, "step": 21700 }, { "epoch": 0.24095586528577587, "grad_norm": 2.4106123447418213, "learning_rate": 4.3986196424987216e-06, "loss": 0.5065, "step": 21800 }, { "epoch": 0.2420611674201143, "grad_norm": 2.228212833404541, "learning_rate": 4.392844912032896e-06, "loss": 0.4892, "step": 21900 }, { "epoch": 0.2431664695544527, "grad_norm": 2.2582526206970215, "learning_rate": 4.387046417559471e-06, "loss": 0.443, "step": 22000 }, { "epoch": 0.24427177168879113, "grad_norm": 3.1825761795043945, "learning_rate": 4.381224231876521e-06, "loss": 0.4607, "step": 22100 }, { "epoch": 0.24537707382312957, "grad_norm": 1.9606397151947021, "learning_rate": 4.375378428079557e-06, "loss": 0.4431, "step": 22200 }, { "epoch": 0.24648237595746797, "grad_norm": 1.9158498048782349, "learning_rate": 4.369509079560608e-06, "loss": 0.4923, "step": 22300 }, { "epoch": 0.2475876780918064, "grad_norm": 2.624380111694336, "learning_rate": 4.363616260007294e-06, "loss": 0.4632, "step": 22400 }, { "epoch": 0.2486929802261448, "grad_norm": 1.440521001815796, "learning_rate": 4.357700043401912e-06, "loss": 0.4798, "step": 22500 }, { "epoch": 0.24979828236048324, "grad_norm": 2.1393532752990723, "learning_rate": 4.351760504020496e-06, "loss": 0.459, "step": 22600 }, { "epoch": 0.25090358449482164, "grad_norm": 1.950707197189331, "learning_rate": 4.345797716431891e-06, "loss": 0.5176, "step": 22700 }, { "epoch": 0.2520088866291601, "grad_norm": 2.3011667728424072, "learning_rate": 4.339811755496817e-06, "loss": 0.4838, "step": 22800 }, { "epoch": 0.2531141887634985, "grad_norm": 1.6088446378707886, "learning_rate": 4.333802696366923e-06, "loss": 0.4588, "step": 22900 }, { "epoch": 0.2542194908978369, "grad_norm": 1.790541410446167, "learning_rate": 4.327770614483853e-06, "loss": 0.4824, "step": 23000 }, { "epoch": 0.25532479303217537, "grad_norm": 2.6423535346984863, "learning_rate": 4.321715585578289e-06, "loss": 0.4589, "step": 23100 }, { "epoch": 0.25643009516651377, "grad_norm": 1.4211223125457764, "learning_rate": 4.315637685669006e-06, "loss": 0.4483, "step": 23200 }, { "epoch": 0.2575353973008522, "grad_norm": 1.9869434833526611, "learning_rate": 4.30953699106192e-06, "loss": 0.4658, "step": 23300 }, { "epoch": 0.25864069943519064, "grad_norm": 1.8357223272323608, "learning_rate": 4.303413578349122e-06, "loss": 0.4697, "step": 23400 }, { "epoch": 0.25974600156952904, "grad_norm": 1.6129013299942017, "learning_rate": 4.2972675244079224e-06, "loss": 0.4612, "step": 23500 }, { "epoch": 0.26085130370386744, "grad_norm": 1.8021016120910645, "learning_rate": 4.291098906399885e-06, "loss": 0.4536, "step": 23600 }, { "epoch": 0.26195660583820585, "grad_norm": 1.4587496519088745, "learning_rate": 4.2849078017698565e-06, "loss": 0.4347, "step": 23700 }, { "epoch": 0.2630619079725443, "grad_norm": 2.1143853664398193, "learning_rate": 4.2786942882449965e-06, "loss": 0.4478, "step": 23800 }, { "epoch": 0.2641672101068827, "grad_norm": 1.9837020635604858, "learning_rate": 4.272458443833801e-06, "loss": 0.4586, "step": 23900 }, { "epoch": 0.2652725122412211, "grad_norm": 1.6629817485809326, "learning_rate": 4.266200346825119e-06, "loss": 0.4609, "step": 24000 }, { "epoch": 0.2663778143755596, "grad_norm": 2.2694997787475586, "learning_rate": 4.259920075787177e-06, "loss": 0.4506, "step": 24100 }, { "epoch": 0.267483116509898, "grad_norm": 2.3292577266693115, "learning_rate": 4.253617709566588e-06, "loss": 0.4517, "step": 24200 }, { "epoch": 0.2685884186442364, "grad_norm": 2.215757369995117, "learning_rate": 4.247293327287359e-06, "loss": 0.4598, "step": 24300 }, { "epoch": 0.26969372077857484, "grad_norm": 2.3665645122528076, "learning_rate": 4.240947008349905e-06, "loss": 0.4926, "step": 24400 }, { "epoch": 0.27079902291291325, "grad_norm": 2.2286605834960938, "learning_rate": 4.234578832430047e-06, "loss": 0.4665, "step": 24500 }, { "epoch": 0.27190432504725165, "grad_norm": 2.3083527088165283, "learning_rate": 4.228188879478011e-06, "loss": 0.4841, "step": 24600 }, { "epoch": 0.2730096271815901, "grad_norm": 1.8674919605255127, "learning_rate": 4.221777229717428e-06, "loss": 0.464, "step": 24700 }, { "epoch": 0.2741149293159285, "grad_norm": 2.442124605178833, "learning_rate": 4.215343963644324e-06, "loss": 0.4462, "step": 24800 }, { "epoch": 0.2752202314502669, "grad_norm": 1.761814832687378, "learning_rate": 4.2088891620261106e-06, "loss": 0.4811, "step": 24900 }, { "epoch": 0.2763255335846054, "grad_norm": 1.81318998336792, "learning_rate": 4.20241290590057e-06, "loss": 0.4819, "step": 25000 }, { "epoch": 0.2774308357189438, "grad_norm": 2.6324472427368164, "learning_rate": 4.1959152765748405e-06, "loss": 0.4942, "step": 25100 }, { "epoch": 0.2785361378532822, "grad_norm": 1.9197957515716553, "learning_rate": 4.189396355624389e-06, "loss": 0.4411, "step": 25200 }, { "epoch": 0.27964143998762064, "grad_norm": 2.736686944961548, "learning_rate": 4.182856224891997e-06, "loss": 0.4679, "step": 25300 }, { "epoch": 0.28074674212195905, "grad_norm": 1.2711482048034668, "learning_rate": 4.176294966486722e-06, "loss": 0.4621, "step": 25400 }, { "epoch": 0.28185204425629745, "grad_norm": 2.046609401702881, "learning_rate": 4.169712662782876e-06, "loss": 0.4733, "step": 25500 }, { "epoch": 0.28295734639063586, "grad_norm": 1.6701066493988037, "learning_rate": 4.163109396418986e-06, "loss": 0.4771, "step": 25600 }, { "epoch": 0.2840626485249743, "grad_norm": 1.8547199964523315, "learning_rate": 4.156485250296757e-06, "loss": 0.4596, "step": 25700 }, { "epoch": 0.2851679506593127, "grad_norm": 2.2946977615356445, "learning_rate": 4.149840307580033e-06, "loss": 0.4497, "step": 25800 }, { "epoch": 0.2862732527936511, "grad_norm": 2.6851511001586914, "learning_rate": 4.143174651693753e-06, "loss": 0.4497, "step": 25900 }, { "epoch": 0.2873785549279896, "grad_norm": 2.5896623134613037, "learning_rate": 4.1364883663229e-06, "loss": 0.4664, "step": 26000 }, { "epoch": 0.288483857062328, "grad_norm": 2.0162718296051025, "learning_rate": 4.129781535411456e-06, "loss": 0.4614, "step": 26100 }, { "epoch": 0.2895891591966664, "grad_norm": 2.3387439250946045, "learning_rate": 4.123054243161342e-06, "loss": 0.4867, "step": 26200 }, { "epoch": 0.29069446133100485, "grad_norm": 2.132131338119507, "learning_rate": 4.116306574031366e-06, "loss": 0.4741, "step": 26300 }, { "epoch": 0.29179976346534325, "grad_norm": 1.7863556146621704, "learning_rate": 4.109538612736161e-06, "loss": 0.4492, "step": 26400 }, { "epoch": 0.29290506559968166, "grad_norm": 2.3342113494873047, "learning_rate": 4.10275044424512e-06, "loss": 0.47, "step": 26500 }, { "epoch": 0.2940103677340201, "grad_norm": 2.0262320041656494, "learning_rate": 4.095942153781329e-06, "loss": 0.4635, "step": 26600 }, { "epoch": 0.2951156698683585, "grad_norm": 2.9538447856903076, "learning_rate": 4.0891138268205025e-06, "loss": 0.4477, "step": 26700 }, { "epoch": 0.2962209720026969, "grad_norm": 2.5609724521636963, "learning_rate": 4.082265549089902e-06, "loss": 0.4546, "step": 26800 }, { "epoch": 0.2973262741370354, "grad_norm": 2.4035484790802, "learning_rate": 4.075397406567265e-06, "loss": 0.494, "step": 26900 }, { "epoch": 0.2984315762713738, "grad_norm": 1.2948765754699707, "learning_rate": 4.068509485479726e-06, "loss": 0.485, "step": 27000 }, { "epoch": 0.2995368784057122, "grad_norm": 1.7401434183120728, "learning_rate": 4.061601872302732e-06, "loss": 0.4451, "step": 27100 }, { "epoch": 0.3006421805400506, "grad_norm": 1.718982219696045, "learning_rate": 4.054674653758956e-06, "loss": 0.4837, "step": 27200 }, { "epoch": 0.30174748267438906, "grad_norm": 2.159252166748047, "learning_rate": 4.047727916817211e-06, "loss": 0.4709, "step": 27300 }, { "epoch": 0.30285278480872746, "grad_norm": 1.9981988668441772, "learning_rate": 4.040761748691356e-06, "loss": 0.468, "step": 27400 }, { "epoch": 0.30395808694306586, "grad_norm": 2.0982799530029297, "learning_rate": 4.033776236839202e-06, "loss": 0.4637, "step": 27500 }, { "epoch": 0.3050633890774043, "grad_norm": 2.9962141513824463, "learning_rate": 4.0267714689614124e-06, "loss": 0.4695, "step": 27600 }, { "epoch": 0.3061686912117427, "grad_norm": 2.803635597229004, "learning_rate": 4.019747533000405e-06, "loss": 0.4771, "step": 27700 }, { "epoch": 0.30727399334608113, "grad_norm": 1.8022634983062744, "learning_rate": 4.012704517139248e-06, "loss": 0.4672, "step": 27800 }, { "epoch": 0.3083792954804196, "grad_norm": 1.9764262437820435, "learning_rate": 4.005642509800545e-06, "loss": 0.4842, "step": 27900 }, { "epoch": 0.309484597614758, "grad_norm": 2.3172965049743652, "learning_rate": 3.998561599645338e-06, "loss": 0.4747, "step": 28000 }, { "epoch": 0.3105898997490964, "grad_norm": 3.117851972579956, "learning_rate": 3.9914618755719816e-06, "loss": 0.4857, "step": 28100 }, { "epoch": 0.31169520188343486, "grad_norm": 2.1363372802734375, "learning_rate": 3.984343426715036e-06, "loss": 0.4405, "step": 28200 }, { "epoch": 0.31280050401777326, "grad_norm": 2.1967580318450928, "learning_rate": 3.977206342444144e-06, "loss": 0.4626, "step": 28300 }, { "epoch": 0.31390580615211167, "grad_norm": 1.6863844394683838, "learning_rate": 3.970050712362908e-06, "loss": 0.4505, "step": 28400 }, { "epoch": 0.3150111082864501, "grad_norm": 2.1374428272247314, "learning_rate": 3.962876626307769e-06, "loss": 0.4522, "step": 28500 }, { "epoch": 0.31611641042078853, "grad_norm": 2.230015754699707, "learning_rate": 3.955684174346872e-06, "loss": 0.4331, "step": 28600 }, { "epoch": 0.31722171255512693, "grad_norm": 2.7188756465911865, "learning_rate": 3.948473446778947e-06, "loss": 0.4788, "step": 28700 }, { "epoch": 0.3183270146894654, "grad_norm": 1.7964341640472412, "learning_rate": 3.94124453413216e-06, "loss": 0.4442, "step": 28800 }, { "epoch": 0.3194323168238038, "grad_norm": 1.4361404180526733, "learning_rate": 3.933997527162987e-06, "loss": 0.4868, "step": 28900 }, { "epoch": 0.3205376189581422, "grad_norm": 2.0563929080963135, "learning_rate": 3.926732516855075e-06, "loss": 0.4921, "step": 29000 }, { "epoch": 0.3216429210924806, "grad_norm": 1.55277419090271, "learning_rate": 3.919449594418094e-06, "loss": 0.4877, "step": 29100 }, { "epoch": 0.32274822322681906, "grad_norm": 2.299819231033325, "learning_rate": 3.912148851286593e-06, "loss": 0.468, "step": 29200 }, { "epoch": 0.32385352536115747, "grad_norm": 1.409555435180664, "learning_rate": 3.904830379118857e-06, "loss": 0.4279, "step": 29300 }, { "epoch": 0.32495882749549587, "grad_norm": 1.9166666269302368, "learning_rate": 3.89749426979575e-06, "loss": 0.4732, "step": 29400 }, { "epoch": 0.32606412962983433, "grad_norm": 2.2752537727355957, "learning_rate": 3.890140615419566e-06, "loss": 0.4605, "step": 29500 }, { "epoch": 0.32716943176417274, "grad_norm": 1.6896592378616333, "learning_rate": 3.882769508312871e-06, "loss": 0.4513, "step": 29600 }, { "epoch": 0.32827473389851114, "grad_norm": 1.8940850496292114, "learning_rate": 3.875381041017343e-06, "loss": 0.4665, "step": 29700 }, { "epoch": 0.3293800360328496, "grad_norm": 2.7840423583984375, "learning_rate": 3.867975306292612e-06, "loss": 0.472, "step": 29800 }, { "epoch": 0.330485338167188, "grad_norm": 1.7090684175491333, "learning_rate": 3.860552397115093e-06, "loss": 0.4239, "step": 29900 }, { "epoch": 0.3315906403015264, "grad_norm": 1.5519531965255737, "learning_rate": 3.853112406676823e-06, "loss": 0.4537, "step": 30000 }, { "epoch": 0.33269594243586487, "grad_norm": 2.7194883823394775, "learning_rate": 3.845655428384286e-06, "loss": 0.5102, "step": 30100 }, { "epoch": 0.33380124457020327, "grad_norm": 2.118680000305176, "learning_rate": 3.838181555857243e-06, "loss": 0.4915, "step": 30200 }, { "epoch": 0.3349065467045417, "grad_norm": 2.484039545059204, "learning_rate": 3.830690882927558e-06, "loss": 0.4603, "step": 30300 }, { "epoch": 0.33601184883888013, "grad_norm": 2.0341908931732178, "learning_rate": 3.823183503638014e-06, "loss": 0.4684, "step": 30400 }, { "epoch": 0.33711715097321854, "grad_norm": 0.9588632583618164, "learning_rate": 3.815659512241141e-06, "loss": 0.4963, "step": 30500 }, { "epoch": 0.33822245310755694, "grad_norm": 2.8853650093078613, "learning_rate": 3.8081190031980266e-06, "loss": 0.4801, "step": 30600 }, { "epoch": 0.33932775524189535, "grad_norm": 1.7053953409194946, "learning_rate": 3.8005620711771318e-06, "loss": 0.4591, "step": 30700 }, { "epoch": 0.3404330573762338, "grad_norm": 2.16013765335083, "learning_rate": 3.7929888110530998e-06, "loss": 0.4598, "step": 30800 }, { "epoch": 0.3415383595105722, "grad_norm": 2.3963918685913086, "learning_rate": 3.7853993179055724e-06, "loss": 0.4681, "step": 30900 }, { "epoch": 0.3426436616449106, "grad_norm": 3.2389566898345947, "learning_rate": 3.7777936870179873e-06, "loss": 0.4717, "step": 31000 }, { "epoch": 0.3437489637792491, "grad_norm": 2.17598032951355, "learning_rate": 3.7701720138763877e-06, "loss": 0.4573, "step": 31100 }, { "epoch": 0.3448542659135875, "grad_norm": 2.4974260330200195, "learning_rate": 3.7625343941682203e-06, "loss": 0.4681, "step": 31200 }, { "epoch": 0.3459595680479259, "grad_norm": 2.331465721130371, "learning_rate": 3.7548809237811378e-06, "loss": 0.4953, "step": 31300 }, { "epoch": 0.34706487018226434, "grad_norm": 1.782915711402893, "learning_rate": 3.7472116988017906e-06, "loss": 0.4257, "step": 31400 }, { "epoch": 0.34817017231660274, "grad_norm": 1.96134352684021, "learning_rate": 3.7395268155146232e-06, "loss": 0.4489, "step": 31500 }, { "epoch": 0.34927547445094115, "grad_norm": 1.6746424436569214, "learning_rate": 3.731826370400663e-06, "loss": 0.4748, "step": 31600 }, { "epoch": 0.3503807765852796, "grad_norm": 1.7693666219711304, "learning_rate": 3.7241104601363154e-06, "loss": 0.4783, "step": 31700 }, { "epoch": 0.351486078719618, "grad_norm": 1.4009222984313965, "learning_rate": 3.7163791815921394e-06, "loss": 0.4648, "step": 31800 }, { "epoch": 0.3525913808539564, "grad_norm": 2.408993721008301, "learning_rate": 3.708632631831643e-06, "loss": 0.4382, "step": 31900 }, { "epoch": 0.3536966829882949, "grad_norm": 1.713916540145874, "learning_rate": 3.7008709081100537e-06, "loss": 0.4258, "step": 32000 }, { "epoch": 0.3548019851226333, "grad_norm": 2.0615127086639404, "learning_rate": 3.6930941078731065e-06, "loss": 0.4874, "step": 32100 }, { "epoch": 0.3559072872569717, "grad_norm": 2.3877241611480713, "learning_rate": 3.685302328755815e-06, "loss": 0.507, "step": 32200 }, { "epoch": 0.35701258939131014, "grad_norm": 2.4597456455230713, "learning_rate": 3.6774956685812496e-06, "loss": 0.4513, "step": 32300 }, { "epoch": 0.35811789152564855, "grad_norm": 2.5451297760009766, "learning_rate": 3.6696742253593035e-06, "loss": 0.4419, "step": 32400 }, { "epoch": 0.35922319365998695, "grad_norm": 2.2447433471679688, "learning_rate": 3.6618380972854694e-06, "loss": 0.4669, "step": 32500 }, { "epoch": 0.36032849579432535, "grad_norm": 1.7082650661468506, "learning_rate": 3.6539873827396023e-06, "loss": 0.4352, "step": 32600 }, { "epoch": 0.3614337979286638, "grad_norm": 1.607082486152649, "learning_rate": 3.646122180284683e-06, "loss": 0.4595, "step": 32700 }, { "epoch": 0.3625391000630022, "grad_norm": 1.835105299949646, "learning_rate": 3.638242588665587e-06, "loss": 0.4674, "step": 32800 }, { "epoch": 0.3636444021973406, "grad_norm": 1.7002040147781372, "learning_rate": 3.630348706807836e-06, "loss": 0.4746, "step": 32900 }, { "epoch": 0.3647497043316791, "grad_norm": 2.184178590774536, "learning_rate": 3.622440633816366e-06, "loss": 0.4388, "step": 33000 }, { "epoch": 0.3658550064660175, "grad_norm": 2.1649866104125977, "learning_rate": 3.6145184689742716e-06, "loss": 0.4499, "step": 33100 }, { "epoch": 0.3669603086003559, "grad_norm": 1.3153752088546753, "learning_rate": 3.6065823117415716e-06, "loss": 0.4391, "step": 33200 }, { "epoch": 0.36806561073469435, "grad_norm": 1.944061279296875, "learning_rate": 3.5986322617539506e-06, "loss": 0.4833, "step": 33300 }, { "epoch": 0.36917091286903275, "grad_norm": 1.6162335872650146, "learning_rate": 3.590668418821513e-06, "loss": 0.4889, "step": 33400 }, { "epoch": 0.37027621500337116, "grad_norm": 1.623404622077942, "learning_rate": 3.5826908829275296e-06, "loss": 0.4698, "step": 33500 }, { "epoch": 0.3713815171377096, "grad_norm": 1.830082654953003, "learning_rate": 3.57469975422718e-06, "loss": 0.507, "step": 33600 }, { "epoch": 0.372486819272048, "grad_norm": 2.138823986053467, "learning_rate": 3.5666951330462972e-06, "loss": 0.4419, "step": 33700 }, { "epoch": 0.3735921214063864, "grad_norm": 2.455385208129883, "learning_rate": 3.558677119880109e-06, "loss": 0.4729, "step": 33800 }, { "epoch": 0.3746974235407249, "grad_norm": 3.052379846572876, "learning_rate": 3.550645815391973e-06, "loss": 0.447, "step": 33900 }, { "epoch": 0.3758027256750633, "grad_norm": 1.8502277135849, "learning_rate": 3.542601320412116e-06, "loss": 0.4545, "step": 34000 }, { "epoch": 0.3769080278094017, "grad_norm": 2.621030569076538, "learning_rate": 3.534543735936366e-06, "loss": 0.4832, "step": 34100 }, { "epoch": 0.3780133299437401, "grad_norm": 1.681999683380127, "learning_rate": 3.5264731631248867e-06, "loss": 0.4813, "step": 34200 }, { "epoch": 0.37911863207807855, "grad_norm": 1.8637994527816772, "learning_rate": 3.5183897033009018e-06, "loss": 0.5013, "step": 34300 }, { "epoch": 0.38022393421241696, "grad_norm": 1.9797747135162354, "learning_rate": 3.510293457949433e-06, "loss": 0.4473, "step": 34400 }, { "epoch": 0.38132923634675536, "grad_norm": 2.2267913818359375, "learning_rate": 3.502184528716013e-06, "loss": 0.455, "step": 34500 }, { "epoch": 0.3824345384810938, "grad_norm": 1.919852375984192, "learning_rate": 3.494063017405423e-06, "loss": 0.447, "step": 34600 }, { "epoch": 0.3835398406154322, "grad_norm": 2.838737964630127, "learning_rate": 3.485929025980402e-06, "loss": 0.4447, "step": 34700 }, { "epoch": 0.38464514274977063, "grad_norm": 1.7883715629577637, "learning_rate": 3.477782656560377e-06, "loss": 0.4897, "step": 34800 }, { "epoch": 0.3857504448841091, "grad_norm": 1.9990206956863403, "learning_rate": 3.469624011420173e-06, "loss": 0.4533, "step": 34900 }, { "epoch": 0.3868557470184475, "grad_norm": 3.673203706741333, "learning_rate": 3.461453192988734e-06, "loss": 0.4813, "step": 35000 }, { "epoch": 0.3879610491527859, "grad_norm": 1.820590853691101, "learning_rate": 3.4532703038478368e-06, "loss": 0.4582, "step": 35100 }, { "epoch": 0.38906635128712436, "grad_norm": 1.6964892148971558, "learning_rate": 3.445075446730798e-06, "loss": 0.4355, "step": 35200 }, { "epoch": 0.39017165342146276, "grad_norm": 2.7785258293151855, "learning_rate": 3.4368687245211914e-06, "loss": 0.4744, "step": 35300 }, { "epoch": 0.39127695555580116, "grad_norm": 2.661006212234497, "learning_rate": 3.4286502402515504e-06, "loss": 0.4512, "step": 35400 }, { "epoch": 0.3923822576901396, "grad_norm": 1.379711389541626, "learning_rate": 3.4204200971020796e-06, "loss": 0.4727, "step": 35500 }, { "epoch": 0.39348755982447803, "grad_norm": 2.01283860206604, "learning_rate": 3.412178398399355e-06, "loss": 0.4774, "step": 35600 }, { "epoch": 0.39459286195881643, "grad_norm": 1.920944094657898, "learning_rate": 3.4039252476150284e-06, "loss": 0.4775, "step": 35700 }, { "epoch": 0.3956981640931549, "grad_norm": 1.920350193977356, "learning_rate": 3.39566074836453e-06, "loss": 0.4526, "step": 35800 }, { "epoch": 0.3968034662274933, "grad_norm": 2.782977819442749, "learning_rate": 3.3873850044057633e-06, "loss": 0.4541, "step": 35900 }, { "epoch": 0.3979087683618317, "grad_norm": 2.4611635208129883, "learning_rate": 3.3790981196378086e-06, "loss": 0.4964, "step": 36000 }, { "epoch": 0.3990140704961701, "grad_norm": 1.8741673231124878, "learning_rate": 3.370800198099613e-06, "loss": 0.435, "step": 36100 }, { "epoch": 0.40011937263050856, "grad_norm": 1.919241189956665, "learning_rate": 3.362491343968687e-06, "loss": 0.4386, "step": 36200 }, { "epoch": 0.40122467476484697, "grad_norm": 2.52968168258667, "learning_rate": 3.3541716615597948e-06, "loss": 0.4545, "step": 36300 }, { "epoch": 0.40232997689918537, "grad_norm": 2.964994430541992, "learning_rate": 3.3458412553236475e-06, "loss": 0.4551, "step": 36400 }, { "epoch": 0.40343527903352383, "grad_norm": 2.7886335849761963, "learning_rate": 3.337500229845592e-06, "loss": 0.477, "step": 36500 }, { "epoch": 0.40454058116786223, "grad_norm": 1.9467898607254028, "learning_rate": 3.329148689844289e-06, "loss": 0.4546, "step": 36600 }, { "epoch": 0.40564588330220064, "grad_norm": 1.1720269918441772, "learning_rate": 3.320786740170414e-06, "loss": 0.4759, "step": 36700 }, { "epoch": 0.4067511854365391, "grad_norm": 2.1939995288848877, "learning_rate": 3.3124144858053252e-06, "loss": 0.4456, "step": 36800 }, { "epoch": 0.4078564875708775, "grad_norm": 2.350830078125, "learning_rate": 3.304032031859759e-06, "loss": 0.4683, "step": 36900 }, { "epoch": 0.4089617897052159, "grad_norm": 2.4557292461395264, "learning_rate": 3.295639483572498e-06, "loss": 0.4415, "step": 37000 }, { "epoch": 0.41006709183955437, "grad_norm": 1.3871397972106934, "learning_rate": 3.287236946309059e-06, "loss": 0.4635, "step": 37100 }, { "epoch": 0.41117239397389277, "grad_norm": 2.129850387573242, "learning_rate": 3.2788245255603675e-06, "loss": 0.4888, "step": 37200 }, { "epoch": 0.4122776961082312, "grad_norm": 1.527912974357605, "learning_rate": 3.2704023269414304e-06, "loss": 0.4848, "step": 37300 }, { "epoch": 0.41338299824256963, "grad_norm": 1.9338812828063965, "learning_rate": 3.261970456190014e-06, "loss": 0.5031, "step": 37400 }, { "epoch": 0.41448830037690804, "grad_norm": 1.9333993196487427, "learning_rate": 3.253529019165314e-06, "loss": 0.4533, "step": 37500 }, { "epoch": 0.41559360251124644, "grad_norm": 2.1915063858032227, "learning_rate": 3.2450781218466274e-06, "loss": 0.4508, "step": 37600 }, { "epoch": 0.41669890464558484, "grad_norm": 2.150376319885254, "learning_rate": 3.2366178703320232e-06, "loss": 0.4359, "step": 37700 }, { "epoch": 0.4178042067799233, "grad_norm": 2.5346415042877197, "learning_rate": 3.2281483708370074e-06, "loss": 0.474, "step": 37800 }, { "epoch": 0.4189095089142617, "grad_norm": 2.2632484436035156, "learning_rate": 3.2196697296931915e-06, "loss": 0.4317, "step": 37900 }, { "epoch": 0.4200148110486001, "grad_norm": 2.7014644145965576, "learning_rate": 3.2111820533469577e-06, "loss": 0.4493, "step": 38000 }, { "epoch": 0.42112011318293857, "grad_norm": 1.923828363418579, "learning_rate": 3.202685448358122e-06, "loss": 0.4884, "step": 38100 }, { "epoch": 0.422225415317277, "grad_norm": 2.4021315574645996, "learning_rate": 3.1941800213985964e-06, "loss": 0.4457, "step": 38200 }, { "epoch": 0.4233307174516154, "grad_norm": 1.7797712087631226, "learning_rate": 3.1856658792510485e-06, "loss": 0.4786, "step": 38300 }, { "epoch": 0.42443601958595384, "grad_norm": 2.1778018474578857, "learning_rate": 3.177143128807565e-06, "loss": 0.4695, "step": 38400 }, { "epoch": 0.42554132172029224, "grad_norm": 2.2871477603912354, "learning_rate": 3.168611877068302e-06, "loss": 0.4766, "step": 38500 }, { "epoch": 0.42664662385463065, "grad_norm": 3.016216993331909, "learning_rate": 3.1600722311401515e-06, "loss": 0.4544, "step": 38600 }, { "epoch": 0.4277519259889691, "grad_norm": 1.759264349937439, "learning_rate": 3.1515242982353876e-06, "loss": 0.4414, "step": 38700 }, { "epoch": 0.4288572281233075, "grad_norm": 2.0453083515167236, "learning_rate": 3.1429681856703287e-06, "loss": 0.4471, "step": 38800 }, { "epoch": 0.4299625302576459, "grad_norm": 1.5130780935287476, "learning_rate": 3.1344040008639797e-06, "loss": 0.4469, "step": 38900 }, { "epoch": 0.4310678323919844, "grad_norm": 1.812267541885376, "learning_rate": 3.1258318513366975e-06, "loss": 0.4754, "step": 39000 }, { "epoch": 0.4321731345263228, "grad_norm": 1.798132300376892, "learning_rate": 3.1172518447088264e-06, "loss": 0.4519, "step": 39100 }, { "epoch": 0.4332784366606612, "grad_norm": 2.252378463745117, "learning_rate": 3.108664088699358e-06, "loss": 0.4622, "step": 39200 }, { "epoch": 0.4343837387949996, "grad_norm": 1.2119619846343994, "learning_rate": 3.100068691124572e-06, "loss": 0.4541, "step": 39300 }, { "epoch": 0.43548904092933804, "grad_norm": 1.4428755044937134, "learning_rate": 3.091465759896688e-06, "loss": 0.4731, "step": 39400 }, { "epoch": 0.43659434306367645, "grad_norm": 1.7551451921463013, "learning_rate": 3.082855403022507e-06, "loss": 0.441, "step": 39500 }, { "epoch": 0.43769964519801485, "grad_norm": 1.55975341796875, "learning_rate": 3.0742377286020547e-06, "loss": 0.4249, "step": 39600 }, { "epoch": 0.4388049473323533, "grad_norm": 1.1946512460708618, "learning_rate": 3.0656128448272284e-06, "loss": 0.4709, "step": 39700 }, { "epoch": 0.4399102494666917, "grad_norm": 1.1257880926132202, "learning_rate": 3.0569808599804345e-06, "loss": 0.4307, "step": 39800 }, { "epoch": 0.4410155516010301, "grad_norm": 1.8002004623413086, "learning_rate": 3.048341882433232e-06, "loss": 0.4612, "step": 39900 }, { "epoch": 0.4421208537353686, "grad_norm": 2.031006097793579, "learning_rate": 3.039696020644972e-06, "loss": 0.4554, "step": 40000 }, { "epoch": 0.443226155869707, "grad_norm": 2.301436185836792, "learning_rate": 3.0310433831614307e-06, "loss": 0.4387, "step": 40100 }, { "epoch": 0.4443314580040454, "grad_norm": 1.4582908153533936, "learning_rate": 3.0223840786134553e-06, "loss": 0.455, "step": 40200 }, { "epoch": 0.44543676013838385, "grad_norm": 2.0824360847473145, "learning_rate": 3.013718215715593e-06, "loss": 0.4828, "step": 40300 }, { "epoch": 0.44654206227272225, "grad_norm": 2.2939536571502686, "learning_rate": 3.0050459032647306e-06, "loss": 0.457, "step": 40400 }, { "epoch": 0.44764736440706066, "grad_norm": 2.297245979309082, "learning_rate": 2.9963672501387247e-06, "loss": 0.4778, "step": 40500 }, { "epoch": 0.4487526665413991, "grad_norm": 1.8728293180465698, "learning_rate": 2.987682365295038e-06, "loss": 0.4448, "step": 40600 }, { "epoch": 0.4498579686757375, "grad_norm": 1.5255945920944214, "learning_rate": 2.978991357769371e-06, "loss": 0.4472, "step": 40700 }, { "epoch": 0.4509632708100759, "grad_norm": 2.7456576824188232, "learning_rate": 2.9702943366742915e-06, "loss": 0.4668, "step": 40800 }, { "epoch": 0.4520685729444144, "grad_norm": 2.2749907970428467, "learning_rate": 2.961591411197865e-06, "loss": 0.4483, "step": 40900 }, { "epoch": 0.4531738750787528, "grad_norm": 2.1402695178985596, "learning_rate": 2.9528826906022843e-06, "loss": 0.4487, "step": 41000 }, { "epoch": 0.4542791772130912, "grad_norm": 2.3826072216033936, "learning_rate": 2.944168284222502e-06, "loss": 0.4953, "step": 41100 }, { "epoch": 0.4553844793474296, "grad_norm": 2.2698001861572266, "learning_rate": 2.9354483014648463e-06, "loss": 0.484, "step": 41200 }, { "epoch": 0.45648978148176805, "grad_norm": 1.9907783269882202, "learning_rate": 2.926722851805661e-06, "loss": 0.4398, "step": 41300 }, { "epoch": 0.45759508361610646, "grad_norm": 1.5543720722198486, "learning_rate": 2.917992044789923e-06, "loss": 0.4363, "step": 41400 }, { "epoch": 0.45870038575044486, "grad_norm": 1.8793258666992188, "learning_rate": 2.909255990029869e-06, "loss": 0.4567, "step": 41500 }, { "epoch": 0.4598056878847833, "grad_norm": 2.4277260303497314, "learning_rate": 2.900514797203617e-06, "loss": 0.4491, "step": 41600 }, { "epoch": 0.4609109900191217, "grad_norm": 2.2503464221954346, "learning_rate": 2.891768576053797e-06, "loss": 0.4804, "step": 41700 }, { "epoch": 0.46201629215346013, "grad_norm": 1.4896454811096191, "learning_rate": 2.8830174363861635e-06, "loss": 0.4403, "step": 41800 }, { "epoch": 0.4631215942877986, "grad_norm": 2.510836601257324, "learning_rate": 2.874261488068221e-06, "loss": 0.451, "step": 41900 }, { "epoch": 0.464226896422137, "grad_norm": 1.5463513135910034, "learning_rate": 2.8655008410278482e-06, "loss": 0.4671, "step": 42000 }, { "epoch": 0.4653321985564754, "grad_norm": 2.300896167755127, "learning_rate": 2.856735605251912e-06, "loss": 0.4348, "step": 42100 }, { "epoch": 0.46643750069081386, "grad_norm": 2.3069446086883545, "learning_rate": 2.8479658907848893e-06, "loss": 0.4478, "step": 42200 }, { "epoch": 0.46754280282515226, "grad_norm": 2.1205623149871826, "learning_rate": 2.8391918077274873e-06, "loss": 0.4346, "step": 42300 }, { "epoch": 0.46864810495949066, "grad_norm": 1.638277292251587, "learning_rate": 2.830413466235258e-06, "loss": 0.4395, "step": 42400 }, { "epoch": 0.4697534070938291, "grad_norm": 2.0386252403259277, "learning_rate": 2.8216309765172156e-06, "loss": 0.4421, "step": 42500 }, { "epoch": 0.4708587092281675, "grad_norm": 2.241922378540039, "learning_rate": 2.8128444488344565e-06, "loss": 0.4518, "step": 42600 }, { "epoch": 0.47196401136250593, "grad_norm": 2.304940938949585, "learning_rate": 2.8040539934987697e-06, "loss": 0.4803, "step": 42700 }, { "epoch": 0.47306931349684433, "grad_norm": 2.377882480621338, "learning_rate": 2.795259720871256e-06, "loss": 0.4637, "step": 42800 }, { "epoch": 0.4741746156311828, "grad_norm": 1.9520049095153809, "learning_rate": 2.7864617413609414e-06, "loss": 0.452, "step": 42900 }, { "epoch": 0.4752799177655212, "grad_norm": 2.1737561225891113, "learning_rate": 2.777660165423388e-06, "loss": 0.4622, "step": 43000 }, { "epoch": 0.4763852198998596, "grad_norm": 1.6113853454589844, "learning_rate": 2.7688551035593125e-06, "loss": 0.448, "step": 43100 }, { "epoch": 0.47749052203419806, "grad_norm": 2.39670729637146, "learning_rate": 2.760046666313196e-06, "loss": 0.4512, "step": 43200 }, { "epoch": 0.47859582416853647, "grad_norm": 1.8168816566467285, "learning_rate": 2.7512349642718927e-06, "loss": 0.4712, "step": 43300 }, { "epoch": 0.47970112630287487, "grad_norm": 1.6397266387939453, "learning_rate": 2.7424201080632516e-06, "loss": 0.4569, "step": 43400 }, { "epoch": 0.48080642843721333, "grad_norm": 2.2524404525756836, "learning_rate": 2.7336022083547153e-06, "loss": 0.4882, "step": 43500 }, { "epoch": 0.48191173057155173, "grad_norm": 2.5701520442962646, "learning_rate": 2.72478137585194e-06, "loss": 0.4593, "step": 43600 }, { "epoch": 0.48301703270589014, "grad_norm": 1.691336989402771, "learning_rate": 2.7159577212973985e-06, "loss": 0.4743, "step": 43700 }, { "epoch": 0.4841223348402286, "grad_norm": 1.9625279903411865, "learning_rate": 2.7071313554689994e-06, "loss": 0.4834, "step": 43800 }, { "epoch": 0.485227636974567, "grad_norm": 1.4627450704574585, "learning_rate": 2.6983023891786835e-06, "loss": 0.4513, "step": 43900 }, { "epoch": 0.4863329391089054, "grad_norm": 2.0734519958496094, "learning_rate": 2.689470933271045e-06, "loss": 0.4611, "step": 44000 }, { "epoch": 0.48743824124324386, "grad_norm": 1.5627169609069824, "learning_rate": 2.6806370986219305e-06, "loss": 0.445, "step": 44100 }, { "epoch": 0.48854354337758227, "grad_norm": 2.4556682109832764, "learning_rate": 2.6718009961370544e-06, "loss": 0.4255, "step": 44200 }, { "epoch": 0.48964884551192067, "grad_norm": 1.817841649055481, "learning_rate": 2.6629627367505996e-06, "loss": 0.4725, "step": 44300 }, { "epoch": 0.49075414764625913, "grad_norm": 2.1898646354675293, "learning_rate": 2.6541224314238306e-06, "loss": 0.4321, "step": 44400 }, { "epoch": 0.49185944978059754, "grad_norm": 1.9783952236175537, "learning_rate": 2.645280191143697e-06, "loss": 0.473, "step": 44500 }, { "epoch": 0.49296475191493594, "grad_norm": 2.2066643238067627, "learning_rate": 2.6364361269214404e-06, "loss": 0.4388, "step": 44600 }, { "epoch": 0.49407005404927434, "grad_norm": 1.5500693321228027, "learning_rate": 2.627590349791203e-06, "loss": 0.4515, "step": 44700 }, { "epoch": 0.4951753561836128, "grad_norm": 1.9073359966278076, "learning_rate": 2.6187429708086304e-06, "loss": 0.4475, "step": 44800 }, { "epoch": 0.4962806583179512, "grad_norm": 1.692548394203186, "learning_rate": 2.6098941010494793e-06, "loss": 0.4116, "step": 44900 }, { "epoch": 0.4973859604522896, "grad_norm": 1.8653684854507446, "learning_rate": 2.6010438516082244e-06, "loss": 0.4462, "step": 45000 }, { "epoch": 0.49849126258662807, "grad_norm": 2.772581100463867, "learning_rate": 2.592192333596658e-06, "loss": 0.4465, "step": 45100 }, { "epoch": 0.4995965647209665, "grad_norm": 1.9330416917800903, "learning_rate": 2.583339658142503e-06, "loss": 0.4693, "step": 45200 }, { "epoch": 0.5007018668553049, "grad_norm": 1.846220850944519, "learning_rate": 2.574485936388011e-06, "loss": 0.4782, "step": 45300 }, { "epoch": 0.5018071689896433, "grad_norm": 1.9324105978012085, "learning_rate": 2.5656312794885696e-06, "loss": 0.476, "step": 45400 }, { "epoch": 0.5029124711239817, "grad_norm": 1.4215826988220215, "learning_rate": 2.5567757986113082e-06, "loss": 0.4404, "step": 45500 }, { "epoch": 0.5040177732583202, "grad_norm": 2.124636173248291, "learning_rate": 2.5479196049336994e-06, "loss": 0.4685, "step": 45600 }, { "epoch": 0.5051230753926585, "grad_norm": 2.1870932579040527, "learning_rate": 2.5390628096421675e-06, "loss": 0.4384, "step": 45700 }, { "epoch": 0.506228377526997, "grad_norm": 2.281766891479492, "learning_rate": 2.5302055239306857e-06, "loss": 0.4849, "step": 45800 }, { "epoch": 0.5073336796613355, "grad_norm": 2.991182804107666, "learning_rate": 2.5213478589993884e-06, "loss": 0.4585, "step": 45900 }, { "epoch": 0.5084389817956738, "grad_norm": 2.271472930908203, "learning_rate": 2.5124899260531667e-06, "loss": 0.4459, "step": 46000 }, { "epoch": 0.5095442839300123, "grad_norm": 1.7806503772735596, "learning_rate": 2.5036318363002816e-06, "loss": 0.4448, "step": 46100 }, { "epoch": 0.5106495860643507, "grad_norm": 2.3559248447418213, "learning_rate": 2.4947737009509577e-06, "loss": 0.4468, "step": 46200 }, { "epoch": 0.5117548881986891, "grad_norm": 2.1456425189971924, "learning_rate": 2.4859156312159945e-06, "loss": 0.4304, "step": 46300 }, { "epoch": 0.5128601903330275, "grad_norm": 2.4595870971679688, "learning_rate": 2.4770577383053695e-06, "loss": 0.4756, "step": 46400 }, { "epoch": 0.513965492467366, "grad_norm": 1.6186550855636597, "learning_rate": 2.4682001334268376e-06, "loss": 0.4246, "step": 46500 }, { "epoch": 0.5150707946017044, "grad_norm": 2.1293444633483887, "learning_rate": 2.4593429277845366e-06, "loss": 0.4373, "step": 46600 }, { "epoch": 0.5161760967360428, "grad_norm": 2.4468750953674316, "learning_rate": 2.450486232577596e-06, "loss": 0.4722, "step": 46700 }, { "epoch": 0.5172813988703813, "grad_norm": 1.3718825578689575, "learning_rate": 2.441630158998734e-06, "loss": 0.4625, "step": 46800 }, { "epoch": 0.5183867010047196, "grad_norm": 1.7043936252593994, "learning_rate": 2.432774818232865e-06, "loss": 0.4889, "step": 46900 }, { "epoch": 0.5194920031390581, "grad_norm": 1.942793607711792, "learning_rate": 2.4239203214557026e-06, "loss": 0.4539, "step": 47000 }, { "epoch": 0.5205973052733965, "grad_norm": 2.086621046066284, "learning_rate": 2.4150667798323664e-06, "loss": 0.4303, "step": 47100 }, { "epoch": 0.5217026074077349, "grad_norm": 2.2322304248809814, "learning_rate": 2.406214304515982e-06, "loss": 0.4616, "step": 47200 }, { "epoch": 0.5228079095420733, "grad_norm": 1.703951120376587, "learning_rate": 2.3973630066462895e-06, "loss": 0.4479, "step": 47300 }, { "epoch": 0.5239132116764117, "grad_norm": 1.6014420986175537, "learning_rate": 2.3885129973482475e-06, "loss": 0.4269, "step": 47400 }, { "epoch": 0.5250185138107502, "grad_norm": 2.385668992996216, "learning_rate": 2.379664387730634e-06, "loss": 0.4284, "step": 47500 }, { "epoch": 0.5261238159450886, "grad_norm": 2.08682918548584, "learning_rate": 2.370817288884656e-06, "loss": 0.4573, "step": 47600 }, { "epoch": 0.527229118079427, "grad_norm": 1.9396214485168457, "learning_rate": 2.3619718118825536e-06, "loss": 0.4701, "step": 47700 }, { "epoch": 0.5283344202137654, "grad_norm": 1.9038134813308716, "learning_rate": 2.3531280677762064e-06, "loss": 0.4437, "step": 47800 }, { "epoch": 0.5294397223481039, "grad_norm": 2.4148266315460205, "learning_rate": 2.3442861675957353e-06, "loss": 0.4264, "step": 47900 }, { "epoch": 0.5305450244824422, "grad_norm": 2.0972328186035156, "learning_rate": 2.3354462223481126e-06, "loss": 0.4461, "step": 48000 }, { "epoch": 0.5316503266167807, "grad_norm": 2.8991668224334717, "learning_rate": 2.326608343015769e-06, "loss": 0.4461, "step": 48100 }, { "epoch": 0.5327556287511191, "grad_norm": 1.24418306350708, "learning_rate": 2.3177726405551953e-06, "loss": 0.4329, "step": 48200 }, { "epoch": 0.5338609308854575, "grad_norm": 1.501638650894165, "learning_rate": 2.308939225895554e-06, "loss": 0.4252, "step": 48300 }, { "epoch": 0.534966233019796, "grad_norm": 1.7708169221878052, "learning_rate": 2.300108209937284e-06, "loss": 0.4492, "step": 48400 }, { "epoch": 0.5360715351541344, "grad_norm": 1.757341980934143, "learning_rate": 2.2912797035507118e-06, "loss": 0.4342, "step": 48500 }, { "epoch": 0.5371768372884728, "grad_norm": 1.7680574655532837, "learning_rate": 2.2824538175746554e-06, "loss": 0.4524, "step": 48600 }, { "epoch": 0.5382821394228112, "grad_norm": 2.0074987411499023, "learning_rate": 2.2736306628150322e-06, "loss": 0.436, "step": 48700 }, { "epoch": 0.5393874415571497, "grad_norm": 1.9048947095870972, "learning_rate": 2.2648103500434756e-06, "loss": 0.4189, "step": 48800 }, { "epoch": 0.540492743691488, "grad_norm": 2.519080638885498, "learning_rate": 2.255992989995934e-06, "loss": 0.4251, "step": 48900 }, { "epoch": 0.5415980458258265, "grad_norm": 2.2120232582092285, "learning_rate": 2.247178693371288e-06, "loss": 0.4933, "step": 49000 }, { "epoch": 0.542703347960165, "grad_norm": 1.7563016414642334, "learning_rate": 2.238367570829954e-06, "loss": 0.4602, "step": 49100 }, { "epoch": 0.5438086500945033, "grad_norm": 1.5373327732086182, "learning_rate": 2.229559732992507e-06, "loss": 0.4792, "step": 49200 }, { "epoch": 0.5449139522288418, "grad_norm": 2.573272228240967, "learning_rate": 2.220755290438275e-06, "loss": 0.4659, "step": 49300 }, { "epoch": 0.5460192543631802, "grad_norm": 1.7102992534637451, "learning_rate": 2.211954353703965e-06, "loss": 0.4553, "step": 49400 }, { "epoch": 0.5471245564975186, "grad_norm": 2.3353729248046875, "learning_rate": 2.203157033282265e-06, "loss": 0.4307, "step": 49500 }, { "epoch": 0.548229858631857, "grad_norm": 1.7641658782958984, "learning_rate": 2.194363439620468e-06, "loss": 0.4648, "step": 49600 }, { "epoch": 0.5493351607661955, "grad_norm": 1.2468318939208984, "learning_rate": 2.1855736831190723e-06, "loss": 0.4616, "step": 49700 }, { "epoch": 0.5504404629005338, "grad_norm": 2.137446880340576, "learning_rate": 2.1767878741304044e-06, "loss": 0.4671, "step": 49800 }, { "epoch": 0.5515457650348723, "grad_norm": 2.4773776531219482, "learning_rate": 2.1680061229572343e-06, "loss": 0.4737, "step": 49900 }, { "epoch": 0.5526510671692108, "grad_norm": 2.0055341720581055, "learning_rate": 2.1592285398513815e-06, "loss": 0.4533, "step": 50000 }, { "epoch": 0.5537563693035491, "grad_norm": 1.876347303390503, "learning_rate": 2.150455235012342e-06, "loss": 0.4208, "step": 50100 }, { "epoch": 0.5548616714378876, "grad_norm": 2.5351920127868652, "learning_rate": 2.1416863185858964e-06, "loss": 0.4404, "step": 50200 }, { "epoch": 0.555966973572226, "grad_norm": 1.0931345224380493, "learning_rate": 2.132921900662733e-06, "loss": 0.4465, "step": 50300 }, { "epoch": 0.5570722757065644, "grad_norm": 2.0798308849334717, "learning_rate": 2.1241620912770612e-06, "loss": 0.4152, "step": 50400 }, { "epoch": 0.5581775778409028, "grad_norm": 1.6538605690002441, "learning_rate": 2.115407000405231e-06, "loss": 0.4209, "step": 50500 }, { "epoch": 0.5592828799752413, "grad_norm": 2.1094820499420166, "learning_rate": 2.1066567379643557e-06, "loss": 0.4367, "step": 50600 }, { "epoch": 0.5603881821095796, "grad_norm": 2.1819286346435547, "learning_rate": 2.097911413810928e-06, "loss": 0.4525, "step": 50700 }, { "epoch": 0.5614934842439181, "grad_norm": 2.0643765926361084, "learning_rate": 2.089171137739441e-06, "loss": 0.4504, "step": 50800 }, { "epoch": 0.5625987863782564, "grad_norm": 1.5290354490280151, "learning_rate": 2.0804360194810117e-06, "loss": 0.4313, "step": 50900 }, { "epoch": 0.5637040885125949, "grad_norm": 1.9766910076141357, "learning_rate": 2.0717061687020047e-06, "loss": 0.4177, "step": 51000 }, { "epoch": 0.5648093906469334, "grad_norm": 1.1951794624328613, "learning_rate": 2.0629816950026505e-06, "loss": 0.5075, "step": 51100 }, { "epoch": 0.5659146927812717, "grad_norm": 2.3847384452819824, "learning_rate": 2.054262707915671e-06, "loss": 0.4196, "step": 51200 }, { "epoch": 0.5670199949156102, "grad_norm": 1.665724515914917, "learning_rate": 2.0455493169049115e-06, "loss": 0.4333, "step": 51300 }, { "epoch": 0.5681252970499486, "grad_norm": 1.6288607120513916, "learning_rate": 2.036841631363954e-06, "loss": 0.4853, "step": 51400 }, { "epoch": 0.569230599184287, "grad_norm": 2.2280824184417725, "learning_rate": 2.028139760614754e-06, "loss": 0.456, "step": 51500 }, { "epoch": 0.5703359013186254, "grad_norm": 2.9321858882904053, "learning_rate": 2.019443813906262e-06, "loss": 0.4694, "step": 51600 }, { "epoch": 0.5714412034529639, "grad_norm": 2.381856918334961, "learning_rate": 2.0107539004130577e-06, "loss": 0.4679, "step": 51700 }, { "epoch": 0.5725465055873022, "grad_norm": 2.0987162590026855, "learning_rate": 2.002070129233972e-06, "loss": 0.4611, "step": 51800 }, { "epoch": 0.5736518077216407, "grad_norm": 2.339217185974121, "learning_rate": 1.993392609390723e-06, "loss": 0.5007, "step": 51900 }, { "epoch": 0.5747571098559792, "grad_norm": 1.3680297136306763, "learning_rate": 1.984721449826547e-06, "loss": 0.4823, "step": 52000 }, { "epoch": 0.5758624119903175, "grad_norm": 1.494996190071106, "learning_rate": 1.976056759404827e-06, "loss": 0.4528, "step": 52100 }, { "epoch": 0.576967714124656, "grad_norm": 2.1765034198760986, "learning_rate": 1.967398646907728e-06, "loss": 0.4476, "step": 52200 }, { "epoch": 0.5780730162589944, "grad_norm": 1.8729513883590698, "learning_rate": 1.9587472210348318e-06, "loss": 0.4626, "step": 52300 }, { "epoch": 0.5791783183933328, "grad_norm": 1.8249151706695557, "learning_rate": 1.950102590401774e-06, "loss": 0.4488, "step": 52400 }, { "epoch": 0.5802836205276712, "grad_norm": 1.604670763015747, "learning_rate": 1.9414648635388765e-06, "loss": 0.4385, "step": 52500 }, { "epoch": 0.5813889226620097, "grad_norm": 1.7172939777374268, "learning_rate": 1.932834148889785e-06, "loss": 0.452, "step": 52600 }, { "epoch": 0.582494224796348, "grad_norm": 2.7707228660583496, "learning_rate": 1.924210554810114e-06, "loss": 0.4213, "step": 52700 }, { "epoch": 0.5835995269306865, "grad_norm": 1.858169436454773, "learning_rate": 1.9155941895660775e-06, "loss": 0.4422, "step": 52800 }, { "epoch": 0.584704829065025, "grad_norm": 2.9702155590057373, "learning_rate": 1.9069851613331363e-06, "loss": 0.4903, "step": 52900 }, { "epoch": 0.5858101311993633, "grad_norm": 1.5274828672409058, "learning_rate": 1.8983835781946355e-06, "loss": 0.4359, "step": 53000 }, { "epoch": 0.5869154333337018, "grad_norm": 1.5798296928405762, "learning_rate": 1.8897895481404523e-06, "loss": 0.4666, "step": 53100 }, { "epoch": 0.5880207354680402, "grad_norm": 2.6816885471343994, "learning_rate": 1.8812031790656365e-06, "loss": 0.4603, "step": 53200 }, { "epoch": 0.5891260376023786, "grad_norm": 2.24021577835083, "learning_rate": 1.8726245787690556e-06, "loss": 0.4434, "step": 53300 }, { "epoch": 0.590231339736717, "grad_norm": 2.0478105545043945, "learning_rate": 1.8640538549520432e-06, "loss": 0.4547, "step": 53400 }, { "epoch": 0.5913366418710555, "grad_norm": 2.7488420009613037, "learning_rate": 1.8554911152170491e-06, "loss": 0.401, "step": 53500 }, { "epoch": 0.5924419440053939, "grad_norm": 1.8583904504776, "learning_rate": 1.8469364670662838e-06, "loss": 0.4229, "step": 53600 }, { "epoch": 0.5935472461397323, "grad_norm": 2.7477619647979736, "learning_rate": 1.8383900179003678e-06, "loss": 0.453, "step": 53700 }, { "epoch": 0.5946525482740708, "grad_norm": 2.0758025646209717, "learning_rate": 1.829851875016993e-06, "loss": 0.4528, "step": 53800 }, { "epoch": 0.5957578504084091, "grad_norm": 1.2921638488769531, "learning_rate": 1.8213221456095626e-06, "loss": 0.45, "step": 53900 }, { "epoch": 0.5968631525427476, "grad_norm": 1.9033405780792236, "learning_rate": 1.812800936765855e-06, "loss": 0.4489, "step": 54000 }, { "epoch": 0.597968454677086, "grad_norm": 2.5085136890411377, "learning_rate": 1.8042883554666733e-06, "loss": 0.4501, "step": 54100 }, { "epoch": 0.5990737568114244, "grad_norm": 1.3407922983169556, "learning_rate": 1.7957845085845086e-06, "loss": 0.4581, "step": 54200 }, { "epoch": 0.6001790589457628, "grad_norm": 1.598039150238037, "learning_rate": 1.7872895028821902e-06, "loss": 0.4406, "step": 54300 }, { "epoch": 0.6012843610801012, "grad_norm": 1.5193266868591309, "learning_rate": 1.7788034450115522e-06, "loss": 0.4412, "step": 54400 }, { "epoch": 0.6023896632144397, "grad_norm": 2.39776611328125, "learning_rate": 1.7703264415120912e-06, "loss": 0.4446, "step": 54500 }, { "epoch": 0.6034949653487781, "grad_norm": 2.233445167541504, "learning_rate": 1.7618585988096292e-06, "loss": 0.4512, "step": 54600 }, { "epoch": 0.6046002674831165, "grad_norm": 3.316636323928833, "learning_rate": 1.7534000232149772e-06, "loss": 0.4617, "step": 54700 }, { "epoch": 0.6057055696174549, "grad_norm": 1.9188458919525146, "learning_rate": 1.7449508209226007e-06, "loss": 0.4551, "step": 54800 }, { "epoch": 0.6068108717517934, "grad_norm": 2.422166109085083, "learning_rate": 1.7365110980092886e-06, "loss": 0.4213, "step": 54900 }, { "epoch": 0.6079161738861317, "grad_norm": 1.886583685874939, "learning_rate": 1.7280809604328175e-06, "loss": 0.4424, "step": 55000 }, { "epoch": 0.6090214760204702, "grad_norm": 2.0250625610351562, "learning_rate": 1.7196605140306227e-06, "loss": 0.4474, "step": 55100 }, { "epoch": 0.6101267781548086, "grad_norm": 1.9184309244155884, "learning_rate": 1.7112498645184734e-06, "loss": 0.4483, "step": 55200 }, { "epoch": 0.611232080289147, "grad_norm": 1.7985000610351562, "learning_rate": 1.7028491174891395e-06, "loss": 0.4395, "step": 55300 }, { "epoch": 0.6123373824234855, "grad_norm": 2.2696986198425293, "learning_rate": 1.6944583784110702e-06, "loss": 0.46, "step": 55400 }, { "epoch": 0.6134426845578239, "grad_norm": 1.9761462211608887, "learning_rate": 1.6860777526270663e-06, "loss": 0.4514, "step": 55500 }, { "epoch": 0.6145479866921623, "grad_norm": 1.6298624277114868, "learning_rate": 1.6777073453529628e-06, "loss": 0.4339, "step": 55600 }, { "epoch": 0.6156532888265007, "grad_norm": 1.7984713315963745, "learning_rate": 1.6693472616763023e-06, "loss": 0.4364, "step": 55700 }, { "epoch": 0.6167585909608392, "grad_norm": 2.747307777404785, "learning_rate": 1.6609976065550188e-06, "loss": 0.4817, "step": 55800 }, { "epoch": 0.6178638930951775, "grad_norm": 2.802546739578247, "learning_rate": 1.6526584848161214e-06, "loss": 0.4566, "step": 55900 }, { "epoch": 0.618969195229516, "grad_norm": 1.783996820449829, "learning_rate": 1.644330001154373e-06, "loss": 0.4595, "step": 56000 }, { "epoch": 0.6200744973638544, "grad_norm": 2.170027494430542, "learning_rate": 1.6360122601309819e-06, "loss": 0.4608, "step": 56100 }, { "epoch": 0.6211797994981928, "grad_norm": 1.9390249252319336, "learning_rate": 1.6277053661722836e-06, "loss": 0.4632, "step": 56200 }, { "epoch": 0.6222851016325313, "grad_norm": 1.528578281402588, "learning_rate": 1.6194094235684363e-06, "loss": 0.4299, "step": 56300 }, { "epoch": 0.6233904037668697, "grad_norm": 2.1283223628997803, "learning_rate": 1.611124536472104e-06, "loss": 0.4758, "step": 56400 }, { "epoch": 0.6244957059012081, "grad_norm": 1.7181930541992188, "learning_rate": 1.6028508088971542e-06, "loss": 0.4408, "step": 56500 }, { "epoch": 0.6256010080355465, "grad_norm": 1.5925639867782593, "learning_rate": 1.5945883447173516e-06, "loss": 0.4125, "step": 56600 }, { "epoch": 0.626706310169885, "grad_norm": 2.1560404300689697, "learning_rate": 1.5863372476650518e-06, "loss": 0.4572, "step": 56700 }, { "epoch": 0.6278116123042233, "grad_norm": 1.5837538242340088, "learning_rate": 1.5780976213298987e-06, "loss": 0.4234, "step": 56800 }, { "epoch": 0.6289169144385618, "grad_norm": 1.7496099472045898, "learning_rate": 1.5698695691575278e-06, "loss": 0.4622, "step": 56900 }, { "epoch": 0.6300222165729003, "grad_norm": 1.950454592704773, "learning_rate": 1.5616531944482639e-06, "loss": 0.46, "step": 57000 }, { "epoch": 0.6311275187072386, "grad_norm": 1.873214840888977, "learning_rate": 1.5534486003558256e-06, "loss": 0.4349, "step": 57100 }, { "epoch": 0.6322328208415771, "grad_norm": 2.442535877227783, "learning_rate": 1.5452558898860289e-06, "loss": 0.4525, "step": 57200 }, { "epoch": 0.6333381229759155, "grad_norm": 2.4935104846954346, "learning_rate": 1.5370751658954962e-06, "loss": 0.4348, "step": 57300 }, { "epoch": 0.6344434251102539, "grad_norm": 2.2208077907562256, "learning_rate": 1.5289065310903642e-06, "loss": 0.4525, "step": 57400 }, { "epoch": 0.6355487272445923, "grad_norm": 2.645033121109009, "learning_rate": 1.5207500880249937e-06, "loss": 0.4303, "step": 57500 }, { "epoch": 0.6366540293789308, "grad_norm": 2.4756534099578857, "learning_rate": 1.5126059391006806e-06, "loss": 0.4273, "step": 57600 }, { "epoch": 0.6377593315132691, "grad_norm": 2.156022548675537, "learning_rate": 1.5044741865643752e-06, "loss": 0.4363, "step": 57700 }, { "epoch": 0.6388646336476076, "grad_norm": 1.1067718267440796, "learning_rate": 1.4963549325073937e-06, "loss": 0.477, "step": 57800 }, { "epoch": 0.6399699357819459, "grad_norm": 2.1002750396728516, "learning_rate": 1.488248278864139e-06, "loss": 0.4241, "step": 57900 }, { "epoch": 0.6410752379162844, "grad_norm": 2.1461567878723145, "learning_rate": 1.4801543274108182e-06, "loss": 0.461, "step": 58000 }, { "epoch": 0.6421805400506229, "grad_norm": 1.992863655090332, "learning_rate": 1.4720731797641701e-06, "loss": 0.4419, "step": 58100 }, { "epoch": 0.6432858421849612, "grad_norm": 1.8167692422866821, "learning_rate": 1.464004937380184e-06, "loss": 0.4239, "step": 58200 }, { "epoch": 0.6443911443192997, "grad_norm": 1.0601933002471924, "learning_rate": 1.4559497015528278e-06, "loss": 0.4534, "step": 58300 }, { "epoch": 0.6454964464536381, "grad_norm": 1.5626897811889648, "learning_rate": 1.4479075734127795e-06, "loss": 0.4109, "step": 58400 }, { "epoch": 0.6466017485879765, "grad_norm": 2.2622973918914795, "learning_rate": 1.4398786539261515e-06, "loss": 0.4546, "step": 58500 }, { "epoch": 0.6477070507223149, "grad_norm": 2.4710042476654053, "learning_rate": 1.4318630438932258e-06, "loss": 0.4442, "step": 58600 }, { "epoch": 0.6488123528566534, "grad_norm": 2.6686673164367676, "learning_rate": 1.4238608439471916e-06, "loss": 0.442, "step": 58700 }, { "epoch": 0.6499176549909917, "grad_norm": 1.9529846906661987, "learning_rate": 1.4158721545528786e-06, "loss": 0.4719, "step": 58800 }, { "epoch": 0.6510229571253302, "grad_norm": 1.6578528881072998, "learning_rate": 1.4078970760054952e-06, "loss": 0.4729, "step": 58900 }, { "epoch": 0.6521282592596687, "grad_norm": 1.7940270900726318, "learning_rate": 1.399935708429368e-06, "loss": 0.4512, "step": 59000 }, { "epoch": 0.653233561394007, "grad_norm": 1.85922372341156, "learning_rate": 1.3919881517766941e-06, "loss": 0.4402, "step": 59100 }, { "epoch": 0.6543388635283455, "grad_norm": 2.1098904609680176, "learning_rate": 1.3840545058262729e-06, "loss": 0.4497, "step": 59200 }, { "epoch": 0.6554441656626839, "grad_norm": 1.5995895862579346, "learning_rate": 1.376134870182262e-06, "loss": 0.4626, "step": 59300 }, { "epoch": 0.6565494677970223, "grad_norm": 1.8691281080245972, "learning_rate": 1.3682293442729217e-06, "loss": 0.4674, "step": 59400 }, { "epoch": 0.6576547699313607, "grad_norm": 2.0507023334503174, "learning_rate": 1.3603380273493769e-06, "loss": 0.4547, "step": 59500 }, { "epoch": 0.6587600720656992, "grad_norm": 1.5811275243759155, "learning_rate": 1.3524610184843567e-06, "loss": 0.4523, "step": 59600 }, { "epoch": 0.6598653742000375, "grad_norm": 1.8390048742294312, "learning_rate": 1.3445984165709586e-06, "loss": 0.436, "step": 59700 }, { "epoch": 0.660970676334376, "grad_norm": 2.165388345718384, "learning_rate": 1.3367503203214078e-06, "loss": 0.4259, "step": 59800 }, { "epoch": 0.6620759784687145, "grad_norm": 1.9885059595108032, "learning_rate": 1.3289168282658167e-06, "loss": 0.4394, "step": 59900 }, { "epoch": 0.6631812806030528, "grad_norm": 0.8709326386451721, "learning_rate": 1.3210980387509436e-06, "loss": 0.4507, "step": 60000 }, { "epoch": 0.6642865827373913, "grad_norm": 1.6904494762420654, "learning_rate": 1.3132940499389634e-06, "loss": 0.4469, "step": 60100 }, { "epoch": 0.6653918848717297, "grad_norm": 2.0872297286987305, "learning_rate": 1.3055049598062347e-06, "loss": 0.4256, "step": 60200 }, { "epoch": 0.6664971870060681, "grad_norm": 3.0559935569763184, "learning_rate": 1.2977308661420657e-06, "loss": 0.5023, "step": 60300 }, { "epoch": 0.6676024891404065, "grad_norm": 1.9940212965011597, "learning_rate": 1.2899718665474913e-06, "loss": 0.4416, "step": 60400 }, { "epoch": 0.668707791274745, "grad_norm": 1.7937722206115723, "learning_rate": 1.2822280584340458e-06, "loss": 0.4676, "step": 60500 }, { "epoch": 0.6698130934090833, "grad_norm": 3.7665975093841553, "learning_rate": 1.2744995390225378e-06, "loss": 0.4159, "step": 60600 }, { "epoch": 0.6709183955434218, "grad_norm": 2.6829941272735596, "learning_rate": 1.2667864053418316e-06, "loss": 0.4499, "step": 60700 }, { "epoch": 0.6720236976777603, "grad_norm": 3.8452253341674805, "learning_rate": 1.2590887542276314e-06, "loss": 0.4391, "step": 60800 }, { "epoch": 0.6731289998120986, "grad_norm": 2.4866082668304443, "learning_rate": 1.2514066823212623e-06, "loss": 0.4567, "step": 60900 }, { "epoch": 0.6742343019464371, "grad_norm": 1.9398912191390991, "learning_rate": 1.2437402860684566e-06, "loss": 0.479, "step": 61000 }, { "epoch": 0.6753396040807755, "grad_norm": 2.085367202758789, "learning_rate": 1.2360896617181442e-06, "loss": 0.441, "step": 61100 }, { "epoch": 0.6764449062151139, "grad_norm": 1.9988934993743896, "learning_rate": 1.2284549053212461e-06, "loss": 0.4435, "step": 61200 }, { "epoch": 0.6775502083494523, "grad_norm": 1.8229702711105347, "learning_rate": 1.2208361127294662e-06, "loss": 0.4682, "step": 61300 }, { "epoch": 0.6786555104837907, "grad_norm": 2.7625458240509033, "learning_rate": 1.2132333795940873e-06, "loss": 0.4731, "step": 61400 }, { "epoch": 0.6797608126181292, "grad_norm": 2.0298068523406982, "learning_rate": 1.2056468013647699e-06, "loss": 0.4599, "step": 61500 }, { "epoch": 0.6808661147524676, "grad_norm": 1.9047514200210571, "learning_rate": 1.1980764732883613e-06, "loss": 0.4431, "step": 61600 }, { "epoch": 0.681971416886806, "grad_norm": 2.681807041168213, "learning_rate": 1.1905224904076873e-06, "loss": 0.4317, "step": 61700 }, { "epoch": 0.6830767190211444, "grad_norm": 1.9497393369674683, "learning_rate": 1.1829849475603683e-06, "loss": 0.4383, "step": 61800 }, { "epoch": 0.6841820211554829, "grad_norm": 1.764805555343628, "learning_rate": 1.1754639393776238e-06, "loss": 0.4375, "step": 61900 }, { "epoch": 0.6852873232898212, "grad_norm": 1.5404030084609985, "learning_rate": 1.1679595602830913e-06, "loss": 0.4419, "step": 62000 }, { "epoch": 0.6863926254241597, "grad_norm": 1.7731199264526367, "learning_rate": 1.160471904491631e-06, "loss": 0.4104, "step": 62100 }, { "epoch": 0.6874979275584981, "grad_norm": 2.781113862991333, "learning_rate": 1.153001066008149e-06, "loss": 0.4098, "step": 62200 }, { "epoch": 0.6886032296928365, "grad_norm": 2.170764207839966, "learning_rate": 1.1455471386264164e-06, "loss": 0.4386, "step": 62300 }, { "epoch": 0.689708531827175, "grad_norm": 1.8785371780395508, "learning_rate": 1.138110215927893e-06, "loss": 0.4689, "step": 62400 }, { "epoch": 0.6908138339615134, "grad_norm": 3.2463815212249756, "learning_rate": 1.1306903912805483e-06, "loss": 0.5066, "step": 62500 }, { "epoch": 0.6919191360958518, "grad_norm": 1.5964540243148804, "learning_rate": 1.123287757837691e-06, "loss": 0.4719, "step": 62600 }, { "epoch": 0.6930244382301902, "grad_norm": 2.1385936737060547, "learning_rate": 1.1159024085368031e-06, "loss": 0.4397, "step": 62700 }, { "epoch": 0.6941297403645287, "grad_norm": 1.62234628200531, "learning_rate": 1.1085344360983696e-06, "loss": 0.4167, "step": 62800 }, { "epoch": 0.695235042498867, "grad_norm": 2.0470333099365234, "learning_rate": 1.1011839330247128e-06, "loss": 0.4526, "step": 62900 }, { "epoch": 0.6963403446332055, "grad_norm": 2.6171181201934814, "learning_rate": 1.0938509915988362e-06, "loss": 0.4793, "step": 63000 }, { "epoch": 0.697445646767544, "grad_norm": 2.3599164485931396, "learning_rate": 1.08653570388326e-06, "loss": 0.4159, "step": 63100 }, { "epoch": 0.6985509489018823, "grad_norm": 2.1658973693847656, "learning_rate": 1.079238161718871e-06, "loss": 0.4399, "step": 63200 }, { "epoch": 0.6996562510362208, "grad_norm": 2.165238618850708, "learning_rate": 1.0719584567237646e-06, "loss": 0.4545, "step": 63300 }, { "epoch": 0.7007615531705592, "grad_norm": 1.8751685619354248, "learning_rate": 1.0646966802920986e-06, "loss": 0.4699, "step": 63400 }, { "epoch": 0.7018668553048976, "grad_norm": 2.2241878509521484, "learning_rate": 1.0574529235929424e-06, "loss": 0.418, "step": 63500 }, { "epoch": 0.702972157439236, "grad_norm": 2.227008104324341, "learning_rate": 1.050227277569133e-06, "loss": 0.4435, "step": 63600 }, { "epoch": 0.7040774595735745, "grad_norm": 2.7472541332244873, "learning_rate": 1.043019832936139e-06, "loss": 0.48, "step": 63700 }, { "epoch": 0.7051827617079128, "grad_norm": 1.427216649055481, "learning_rate": 1.0358306801809123e-06, "loss": 0.4621, "step": 63800 }, { "epoch": 0.7062880638422513, "grad_norm": 2.6720409393310547, "learning_rate": 1.0286599095607576e-06, "loss": 0.4494, "step": 63900 }, { "epoch": 0.7073933659765897, "grad_norm": 2.212963342666626, "learning_rate": 1.021507611102197e-06, "loss": 0.4605, "step": 64000 }, { "epoch": 0.7084986681109281, "grad_norm": 1.640894889831543, "learning_rate": 1.014373874599846e-06, "loss": 0.4313, "step": 64100 }, { "epoch": 0.7096039702452666, "grad_norm": 1.8810545206069946, "learning_rate": 1.0072587896152769e-06, "loss": 0.4316, "step": 64200 }, { "epoch": 0.710709272379605, "grad_norm": 2.1144118309020996, "learning_rate": 1.0001624454758983e-06, "loss": 0.4435, "step": 64300 }, { "epoch": 0.7118145745139434, "grad_norm": 1.9362212419509888, "learning_rate": 9.930849312738366e-07, "loss": 0.4532, "step": 64400 }, { "epoch": 0.7129198766482818, "grad_norm": 2.598273277282715, "learning_rate": 9.860263358648146e-07, "loss": 0.4611, "step": 64500 }, { "epoch": 0.7140251787826203, "grad_norm": 2.244027614593506, "learning_rate": 9.789867478670345e-07, "loss": 0.4351, "step": 64600 }, { "epoch": 0.7151304809169586, "grad_norm": 2.007619619369507, "learning_rate": 9.719662556600672e-07, "loss": 0.4419, "step": 64700 }, { "epoch": 0.7162357830512971, "grad_norm": 2.03373122215271, "learning_rate": 9.649649473837448e-07, "loss": 0.4056, "step": 64800 }, { "epoch": 0.7173410851856354, "grad_norm": 2.0532867908477783, "learning_rate": 9.579829109370506e-07, "loss": 0.4215, "step": 64900 }, { "epoch": 0.7184463873199739, "grad_norm": 2.224346876144409, "learning_rate": 9.510202339770164e-07, "loss": 0.4431, "step": 65000 }, { "epoch": 0.7195516894543124, "grad_norm": 2.053011894226074, "learning_rate": 9.440770039176212e-07, "loss": 0.4496, "step": 65100 }, { "epoch": 0.7206569915886507, "grad_norm": 2.328004837036133, "learning_rate": 9.371533079286976e-07, "loss": 0.443, "step": 65200 }, { "epoch": 0.7217622937229892, "grad_norm": 1.9584163427352905, "learning_rate": 9.302492329348348e-07, "loss": 0.4411, "step": 65300 }, { "epoch": 0.7228675958573276, "grad_norm": 1.6421287059783936, "learning_rate": 9.233648656142838e-07, "loss": 0.447, "step": 65400 }, { "epoch": 0.723972897991666, "grad_norm": 2.134143590927124, "learning_rate": 9.165002923978769e-07, "loss": 0.4494, "step": 65500 }, { "epoch": 0.7250782001260044, "grad_norm": 2.2968268394470215, "learning_rate": 9.096555994679346e-07, "loss": 0.4537, "step": 65600 }, { "epoch": 0.7261835022603429, "grad_norm": 1.4631460905075073, "learning_rate": 9.028308727571905e-07, "loss": 0.4112, "step": 65700 }, { "epoch": 0.7272888043946812, "grad_norm": 3.258443593978882, "learning_rate": 8.960261979477061e-07, "loss": 0.4292, "step": 65800 }, { "epoch": 0.7283941065290197, "grad_norm": 2.0727250576019287, "learning_rate": 8.892416604698021e-07, "loss": 0.4337, "step": 65900 }, { "epoch": 0.7294994086633582, "grad_norm": 2.1423141956329346, "learning_rate": 8.824773455009777e-07, "loss": 0.4304, "step": 66000 }, { "epoch": 0.7306047107976965, "grad_norm": 1.4535356760025024, "learning_rate": 8.757333379648491e-07, "loss": 0.405, "step": 66100 }, { "epoch": 0.731710012932035, "grad_norm": 1.9360605478286743, "learning_rate": 8.690097225300789e-07, "loss": 0.4434, "step": 66200 }, { "epoch": 0.7328153150663734, "grad_norm": 2.19547700881958, "learning_rate": 8.623065836093131e-07, "loss": 0.4207, "step": 66300 }, { "epoch": 0.7339206172007118, "grad_norm": 2.0186522006988525, "learning_rate": 8.556240053581222e-07, "loss": 0.4634, "step": 66400 }, { "epoch": 0.7350259193350502, "grad_norm": 1.845166563987732, "learning_rate": 8.489620716739436e-07, "loss": 0.4466, "step": 66500 }, { "epoch": 0.7361312214693887, "grad_norm": 2.228302001953125, "learning_rate": 8.423208661950342e-07, "loss": 0.4612, "step": 66600 }, { "epoch": 0.737236523603727, "grad_norm": 2.429689884185791, "learning_rate": 8.357004722994105e-07, "loss": 0.4108, "step": 66700 }, { "epoch": 0.7383418257380655, "grad_norm": 3.2977466583251953, "learning_rate": 8.291009731038078e-07, "loss": 0.4497, "step": 66800 }, { "epoch": 0.739447127872404, "grad_norm": 2.6713201999664307, "learning_rate": 8.22522451462637e-07, "loss": 0.4041, "step": 66900 }, { "epoch": 0.7405524300067423, "grad_norm": 1.5487697124481201, "learning_rate": 8.159649899669436e-07, "loss": 0.4521, "step": 67000 }, { "epoch": 0.7416577321410808, "grad_norm": 2.303757429122925, "learning_rate": 8.094286709433683e-07, "loss": 0.444, "step": 67100 }, { "epoch": 0.7427630342754192, "grad_norm": 2.1915831565856934, "learning_rate": 8.029135764531157e-07, "loss": 0.4364, "step": 67200 }, { "epoch": 0.7438683364097576, "grad_norm": 1.9223788976669312, "learning_rate": 7.964197882909252e-07, "loss": 0.444, "step": 67300 }, { "epoch": 0.744973638544096, "grad_norm": 2.2881598472595215, "learning_rate": 7.899473879840431e-07, "loss": 0.4276, "step": 67400 }, { "epoch": 0.7460789406784345, "grad_norm": 1.8012919425964355, "learning_rate": 7.834964567911956e-07, "loss": 0.4057, "step": 67500 }, { "epoch": 0.7471842428127728, "grad_norm": 1.6279646158218384, "learning_rate": 7.770670757015752e-07, "loss": 0.4643, "step": 67600 }, { "epoch": 0.7482895449471113, "grad_norm": 2.4971320629119873, "learning_rate": 7.706593254338174e-07, "loss": 0.4609, "step": 67700 }, { "epoch": 0.7493948470814498, "grad_norm": 1.2119097709655762, "learning_rate": 7.642732864349927e-07, "loss": 0.484, "step": 67800 }, { "epoch": 0.7505001492157881, "grad_norm": 1.7218291759490967, "learning_rate": 7.579090388795923e-07, "loss": 0.4322, "step": 67900 }, { "epoch": 0.7516054513501266, "grad_norm": 1.814095139503479, "learning_rate": 7.51566662668525e-07, "loss": 0.4391, "step": 68000 }, { "epoch": 0.752710753484465, "grad_norm": 1.9664380550384521, "learning_rate": 7.452462374281111e-07, "loss": 0.4384, "step": 68100 }, { "epoch": 0.7538160556188034, "grad_norm": 1.8115942478179932, "learning_rate": 7.389478425090845e-07, "loss": 0.4358, "step": 68200 }, { "epoch": 0.7549213577531418, "grad_norm": 2.317274570465088, "learning_rate": 7.326715569855983e-07, "loss": 0.4266, "step": 68300 }, { "epoch": 0.7560266598874802, "grad_norm": 1.416651964187622, "learning_rate": 7.264174596542262e-07, "loss": 0.4613, "step": 68400 }, { "epoch": 0.7571319620218186, "grad_norm": 2.0251598358154297, "learning_rate": 7.201856290329781e-07, "loss": 0.4353, "step": 68500 }, { "epoch": 0.7582372641561571, "grad_norm": 2.7883288860321045, "learning_rate": 7.139761433603148e-07, "loss": 0.4728, "step": 68600 }, { "epoch": 0.7593425662904955, "grad_norm": 2.3883168697357178, "learning_rate": 7.077890805941631e-07, "loss": 0.4496, "step": 68700 }, { "epoch": 0.7604478684248339, "grad_norm": 1.9152491092681885, "learning_rate": 7.016245184109374e-07, "loss": 0.4222, "step": 68800 }, { "epoch": 0.7615531705591724, "grad_norm": 1.967631459236145, "learning_rate": 6.954825342045648e-07, "loss": 0.4551, "step": 68900 }, { "epoch": 0.7626584726935107, "grad_norm": 5.269169330596924, "learning_rate": 6.893632050855153e-07, "loss": 0.4473, "step": 69000 }, { "epoch": 0.7637637748278492, "grad_norm": 2.2106597423553467, "learning_rate": 6.832666078798319e-07, "loss": 0.4272, "step": 69100 }, { "epoch": 0.7648690769621876, "grad_norm": 1.336655855178833, "learning_rate": 6.771928191281657e-07, "loss": 0.4363, "step": 69200 }, { "epoch": 0.765974379096526, "grad_norm": 2.259783983230591, "learning_rate": 6.711419150848142e-07, "loss": 0.4753, "step": 69300 }, { "epoch": 0.7670796812308645, "grad_norm": 2.4219510555267334, "learning_rate": 6.651139717167684e-07, "loss": 0.4387, "step": 69400 }, { "epoch": 0.7681849833652029, "grad_norm": 1.4461395740509033, "learning_rate": 6.591090647027551e-07, "loss": 0.4333, "step": 69500 }, { "epoch": 0.7692902854995413, "grad_norm": 2.112628221511841, "learning_rate": 6.531272694322865e-07, "loss": 0.4432, "step": 69600 }, { "epoch": 0.7703955876338797, "grad_norm": 2.398404121398926, "learning_rate": 6.471686610047149e-07, "loss": 0.4178, "step": 69700 }, { "epoch": 0.7715008897682182, "grad_norm": 1.9381033182144165, "learning_rate": 6.412333142282912e-07, "loss": 0.4319, "step": 69800 }, { "epoch": 0.7726061919025565, "grad_norm": 2.338209390640259, "learning_rate": 6.353213036192244e-07, "loss": 0.4392, "step": 69900 }, { "epoch": 0.773711494036895, "grad_norm": 2.6548027992248535, "learning_rate": 6.294327034007444e-07, "loss": 0.46, "step": 70000 }, { "epoch": 0.7748167961712334, "grad_norm": 1.4142146110534668, "learning_rate": 6.235675875021741e-07, "loss": 0.4779, "step": 70100 }, { "epoch": 0.7759220983055718, "grad_norm": 2.0672521591186523, "learning_rate": 6.177260295579962e-07, "loss": 0.438, "step": 70200 }, { "epoch": 0.7770274004399103, "grad_norm": 2.526472806930542, "learning_rate": 6.119081029069346e-07, "loss": 0.4127, "step": 70300 }, { "epoch": 0.7781327025742487, "grad_norm": 1.7942878007888794, "learning_rate": 6.061138805910272e-07, "loss": 0.4384, "step": 70400 }, { "epoch": 0.7792380047085871, "grad_norm": 3.063554286956787, "learning_rate": 6.003434353547158e-07, "loss": 0.3919, "step": 70500 }, { "epoch": 0.7803433068429255, "grad_norm": 2.0761284828186035, "learning_rate": 5.945968396439262e-07, "loss": 0.42, "step": 70600 }, { "epoch": 0.781448608977264, "grad_norm": 2.193068504333496, "learning_rate": 5.88874165605163e-07, "loss": 0.4547, "step": 70700 }, { "epoch": 0.7825539111116023, "grad_norm": 1.3570361137390137, "learning_rate": 5.831754850846039e-07, "loss": 0.4401, "step": 70800 }, { "epoch": 0.7836592132459408, "grad_norm": 1.9479831457138062, "learning_rate": 5.775008696271942e-07, "loss": 0.4558, "step": 70900 }, { "epoch": 0.7847645153802792, "grad_norm": 1.4606367349624634, "learning_rate": 5.718503904757503e-07, "loss": 0.4485, "step": 71000 }, { "epoch": 0.7858698175146176, "grad_norm": 1.7804583311080933, "learning_rate": 5.662241185700684e-07, "loss": 0.3965, "step": 71100 }, { "epoch": 0.7869751196489561, "grad_norm": 1.787216067314148, "learning_rate": 5.606221245460297e-07, "loss": 0.4349, "step": 71200 }, { "epoch": 0.7880804217832945, "grad_norm": 2.5382983684539795, "learning_rate": 5.550444787347148e-07, "loss": 0.4296, "step": 71300 }, { "epoch": 0.7891857239176329, "grad_norm": 2.524690866470337, "learning_rate": 5.494912511615205e-07, "loss": 0.4599, "step": 71400 }, { "epoch": 0.7902910260519713, "grad_norm": 1.3965719938278198, "learning_rate": 5.439625115452824e-07, "loss": 0.4503, "step": 71500 }, { "epoch": 0.7913963281863098, "grad_norm": 1.595763921737671, "learning_rate": 5.384583292973985e-07, "loss": 0.4615, "step": 71600 }, { "epoch": 0.7925016303206481, "grad_norm": 1.5032540559768677, "learning_rate": 5.329787735209566e-07, "loss": 0.4287, "step": 71700 }, { "epoch": 0.7936069324549866, "grad_norm": 1.8847301006317139, "learning_rate": 5.275239130098669e-07, "loss": 0.446, "step": 71800 }, { "epoch": 0.7947122345893249, "grad_norm": 1.5111511945724487, "learning_rate": 5.220938162480014e-07, "loss": 0.435, "step": 71900 }, { "epoch": 0.7958175367236634, "grad_norm": 2.1808974742889404, "learning_rate": 5.166885514083311e-07, "loss": 0.4365, "step": 72000 }, { "epoch": 0.7969228388580019, "grad_norm": 1.921736240386963, "learning_rate": 5.113081863520697e-07, "loss": 0.4746, "step": 72100 }, { "epoch": 0.7980281409923402, "grad_norm": 2.0888705253601074, "learning_rate": 5.059527886278246e-07, "loss": 0.4435, "step": 72200 }, { "epoch": 0.7991334431266787, "grad_norm": 2.90547776222229, "learning_rate": 5.006224254707448e-07, "loss": 0.464, "step": 72300 }, { "epoch": 0.8002387452610171, "grad_norm": 1.6634081602096558, "learning_rate": 4.953171638016821e-07, "loss": 0.4243, "step": 72400 }, { "epoch": 0.8013440473953555, "grad_norm": 1.630812644958496, "learning_rate": 4.900370702263443e-07, "loss": 0.3898, "step": 72500 }, { "epoch": 0.8024493495296939, "grad_norm": 2.4027256965637207, "learning_rate": 4.847822110344664e-07, "loss": 0.4398, "step": 72600 }, { "epoch": 0.8035546516640324, "grad_norm": 1.9806816577911377, "learning_rate": 4.795526521989705e-07, "loss": 0.475, "step": 72700 }, { "epoch": 0.8046599537983707, "grad_norm": 2.0573477745056152, "learning_rate": 4.743484593751446e-07, "loss": 0.4239, "step": 72800 }, { "epoch": 0.8057652559327092, "grad_norm": 2.6847050189971924, "learning_rate": 4.6916969789981477e-07, "loss": 0.4509, "step": 72900 }, { "epoch": 0.8068705580670477, "grad_norm": 2.843912124633789, "learning_rate": 4.6401643279052444e-07, "loss": 0.4605, "step": 73000 }, { "epoch": 0.807975860201386, "grad_norm": 2.673027276992798, "learning_rate": 4.588887287447188e-07, "loss": 0.4139, "step": 73100 }, { "epoch": 0.8090811623357245, "grad_norm": 1.7096991539001465, "learning_rate": 4.5378665013893375e-07, "loss": 0.4527, "step": 73200 }, { "epoch": 0.8101864644700629, "grad_norm": 1.959112286567688, "learning_rate": 4.4871026102798755e-07, "loss": 0.4437, "step": 73300 }, { "epoch": 0.8112917666044013, "grad_norm": 1.4862419366836548, "learning_rate": 4.436596251441738e-07, "loss": 0.4287, "step": 73400 }, { "epoch": 0.8123970687387397, "grad_norm": 2.291743278503418, "learning_rate": 4.3863480589646374e-07, "loss": 0.4279, "step": 73500 }, { "epoch": 0.8135023708730782, "grad_norm": 2.421630620956421, "learning_rate": 4.336358663697107e-07, "loss": 0.4497, "step": 73600 }, { "epoch": 0.8146076730074165, "grad_norm": 2.3377912044525146, "learning_rate": 4.286628693238576e-07, "loss": 0.4474, "step": 73700 }, { "epoch": 0.815712975141755, "grad_norm": 2.160400390625, "learning_rate": 4.237158771931468e-07, "loss": 0.4472, "step": 73800 }, { "epoch": 0.8168182772760935, "grad_norm": 2.32997465133667, "learning_rate": 4.187949520853382e-07, "loss": 0.446, "step": 73900 }, { "epoch": 0.8179235794104318, "grad_norm": 2.2677996158599854, "learning_rate": 4.139001557809308e-07, "loss": 0.4408, "step": 74000 }, { "epoch": 0.8190288815447703, "grad_norm": 1.791791558265686, "learning_rate": 4.090315497323852e-07, "loss": 0.4721, "step": 74100 }, { "epoch": 0.8201341836791087, "grad_norm": 1.590136170387268, "learning_rate": 4.041891950633514e-07, "loss": 0.4389, "step": 74200 }, { "epoch": 0.8212394858134471, "grad_norm": 1.7760423421859741, "learning_rate": 3.993731525679029e-07, "loss": 0.4682, "step": 74300 }, { "epoch": 0.8223447879477855, "grad_norm": 1.8399248123168945, "learning_rate": 3.945834827097736e-07, "loss": 0.4345, "step": 74400 }, { "epoch": 0.823450090082124, "grad_norm": 2.691328763961792, "learning_rate": 3.8982024562159854e-07, "loss": 0.4865, "step": 74500 }, { "epoch": 0.8245553922164623, "grad_norm": 2.113375425338745, "learning_rate": 3.8508350110415646e-07, "loss": 0.4288, "step": 74600 }, { "epoch": 0.8256606943508008, "grad_norm": 1.4317853450775146, "learning_rate": 3.8037330862562393e-07, "loss": 0.4465, "step": 74700 }, { "epoch": 0.8267659964851393, "grad_norm": 1.591933012008667, "learning_rate": 3.7568972732082295e-07, "loss": 0.4131, "step": 74800 }, { "epoch": 0.8278712986194776, "grad_norm": 1.7374714612960815, "learning_rate": 3.710328159904844e-07, "loss": 0.4011, "step": 74900 }, { "epoch": 0.8289766007538161, "grad_norm": 2.0382604598999023, "learning_rate": 3.664026331005044e-07, "loss": 0.4176, "step": 75000 }, { "epoch": 0.8300819028881545, "grad_norm": 2.3857359886169434, "learning_rate": 3.6179923678121537e-07, "loss": 0.4921, "step": 75100 }, { "epoch": 0.8311872050224929, "grad_norm": 2.013730764389038, "learning_rate": 3.5722268482665107e-07, "loss": 0.4365, "step": 75200 }, { "epoch": 0.8322925071568313, "grad_norm": 1.714146375656128, "learning_rate": 3.5267303469382506e-07, "loss": 0.4353, "step": 75300 }, { "epoch": 0.8333978092911697, "grad_norm": 1.6847208738327026, "learning_rate": 3.4815034350200893e-07, "loss": 0.4585, "step": 75400 }, { "epoch": 0.8345031114255081, "grad_norm": 2.0972464084625244, "learning_rate": 3.4365466803201216e-07, "loss": 0.441, "step": 75500 }, { "epoch": 0.8356084135598466, "grad_norm": 1.8610143661499023, "learning_rate": 3.3918606472547136e-07, "loss": 0.4351, "step": 75600 }, { "epoch": 0.836713715694185, "grad_norm": 2.597923755645752, "learning_rate": 3.347445896841428e-07, "loss": 0.4196, "step": 75700 }, { "epoch": 0.8378190178285234, "grad_norm": 1.8498742580413818, "learning_rate": 3.30330298669197e-07, "loss": 0.4526, "step": 75800 }, { "epoch": 0.8389243199628619, "grad_norm": 1.8387874364852905, "learning_rate": 3.259432471005175e-07, "loss": 0.4287, "step": 75900 }, { "epoch": 0.8400296220972002, "grad_norm": 2.75079083442688, "learning_rate": 3.215834900560055e-07, "loss": 0.4486, "step": 76000 }, { "epoch": 0.8411349242315387, "grad_norm": 1.793381690979004, "learning_rate": 3.1725108227089074e-07, "loss": 0.4602, "step": 76100 }, { "epoch": 0.8422402263658771, "grad_norm": 1.3438163995742798, "learning_rate": 3.129460781370422e-07, "loss": 0.4441, "step": 76200 }, { "epoch": 0.8433455285002155, "grad_norm": 2.8206710815429688, "learning_rate": 3.0866853170228443e-07, "loss": 0.3989, "step": 76300 }, { "epoch": 0.844450830634554, "grad_norm": 1.9363433122634888, "learning_rate": 3.044184966697203e-07, "loss": 0.4252, "step": 76400 }, { "epoch": 0.8455561327688924, "grad_norm": 2.5586061477661133, "learning_rate": 3.001960263970577e-07, "loss": 0.4957, "step": 76500 }, { "epoch": 0.8466614349032308, "grad_norm": 1.9901615381240845, "learning_rate": 2.960011738959387e-07, "loss": 0.4629, "step": 76600 }, { "epoch": 0.8477667370375692, "grad_norm": 1.8617513179779053, "learning_rate": 2.918339918312718e-07, "loss": 0.4515, "step": 76700 }, { "epoch": 0.8488720391719077, "grad_norm": 1.8503713607788086, "learning_rate": 2.876945325205754e-07, "loss": 0.4614, "step": 76800 }, { "epoch": 0.849977341306246, "grad_norm": 2.3590264320373535, "learning_rate": 2.835828479333164e-07, "loss": 0.4517, "step": 76900 }, { "epoch": 0.8510826434405845, "grad_norm": 1.9208427667617798, "learning_rate": 2.7949898969026114e-07, "loss": 0.4694, "step": 77000 }, { "epoch": 0.852187945574923, "grad_norm": 2.673845052719116, "learning_rate": 2.754430090628243e-07, "loss": 0.4379, "step": 77100 }, { "epoch": 0.8532932477092613, "grad_norm": 2.1295111179351807, "learning_rate": 2.714149569724295e-07, "loss": 0.4654, "step": 77200 }, { "epoch": 0.8543985498435998, "grad_norm": 2.3107078075408936, "learning_rate": 2.6741488398986384e-07, "loss": 0.4267, "step": 77300 }, { "epoch": 0.8555038519779382, "grad_norm": 2.0932328701019287, "learning_rate": 2.6344284033464976e-07, "loss": 0.4141, "step": 77400 }, { "epoch": 0.8566091541122766, "grad_norm": 1.246630072593689, "learning_rate": 2.594988758744088e-07, "loss": 0.4597, "step": 77500 }, { "epoch": 0.857714456246615, "grad_norm": 1.999973177909851, "learning_rate": 2.5558304012423954e-07, "loss": 0.4488, "step": 77600 }, { "epoch": 0.8588197583809535, "grad_norm": 1.827642798423767, "learning_rate": 2.516953822460935e-07, "loss": 0.473, "step": 77700 }, { "epoch": 0.8599250605152918, "grad_norm": 2.323723793029785, "learning_rate": 2.4783595104815954e-07, "loss": 0.4138, "step": 77800 }, { "epoch": 0.8610303626496303, "grad_norm": 2.2066116333007812, "learning_rate": 2.440047949842506e-07, "loss": 0.4466, "step": 77900 }, { "epoch": 0.8621356647839687, "grad_norm": 1.8978465795516968, "learning_rate": 2.402019621531937e-07, "loss": 0.4597, "step": 78000 }, { "epoch": 0.8632409669183071, "grad_norm": 1.499747395515442, "learning_rate": 2.364275002982286e-07, "loss": 0.4103, "step": 78100 }, { "epoch": 0.8643462690526456, "grad_norm": 1.766528606414795, "learning_rate": 2.3268145680640758e-07, "loss": 0.4416, "step": 78200 }, { "epoch": 0.865451571186984, "grad_norm": 2.050598621368408, "learning_rate": 2.2896387870800034e-07, "loss": 0.4238, "step": 78300 }, { "epoch": 0.8665568733213224, "grad_norm": 3.147510290145874, "learning_rate": 2.2527481267590274e-07, "loss": 0.4561, "step": 78400 }, { "epoch": 0.8676621754556608, "grad_norm": 2.1303939819335938, "learning_rate": 2.2161430502505133e-07, "loss": 0.4525, "step": 78500 }, { "epoch": 0.8687674775899992, "grad_norm": 2.706810235977173, "learning_rate": 2.179824017118437e-07, "loss": 0.4467, "step": 78600 }, { "epoch": 0.8698727797243376, "grad_norm": 2.3057336807250977, "learning_rate": 2.1437914833355887e-07, "loss": 0.4511, "step": 78700 }, { "epoch": 0.8709780818586761, "grad_norm": 2.322817087173462, "learning_rate": 2.1080459012778636e-07, "loss": 0.4705, "step": 78800 }, { "epoch": 0.8720833839930144, "grad_norm": 1.8831989765167236, "learning_rate": 2.0725877197185663e-07, "loss": 0.445, "step": 78900 }, { "epoch": 0.8731886861273529, "grad_norm": 2.8571081161499023, "learning_rate": 2.0374173838228013e-07, "loss": 0.4772, "step": 79000 }, { "epoch": 0.8742939882616914, "grad_norm": 2.4051854610443115, "learning_rate": 2.0025353351418753e-07, "loss": 0.4557, "step": 79100 }, { "epoch": 0.8753992903960297, "grad_norm": 1.7439450025558472, "learning_rate": 1.967942011607732e-07, "loss": 0.4421, "step": 79200 }, { "epoch": 0.8765045925303682, "grad_norm": 2.514841318130493, "learning_rate": 1.9336378475274865e-07, "loss": 0.4508, "step": 79300 }, { "epoch": 0.8776098946647066, "grad_norm": 1.8946666717529297, "learning_rate": 1.8996232735779496e-07, "loss": 0.4509, "step": 79400 }, { "epoch": 0.878715196799045, "grad_norm": 1.5840513706207275, "learning_rate": 1.865898716800238e-07, "loss": 0.4557, "step": 79500 }, { "epoch": 0.8798204989333834, "grad_norm": 1.824873924255371, "learning_rate": 1.8324646005943913e-07, "loss": 0.4662, "step": 79600 }, { "epoch": 0.8809258010677219, "grad_norm": 2.0272133350372314, "learning_rate": 1.7993213447140807e-07, "loss": 0.4291, "step": 79700 }, { "epoch": 0.8820311032020602, "grad_norm": 1.1004635095596313, "learning_rate": 1.766469365261317e-07, "loss": 0.3973, "step": 79800 }, { "epoch": 0.8831364053363987, "grad_norm": 2.014890193939209, "learning_rate": 1.7339090746812449e-07, "loss": 0.45, "step": 79900 }, { "epoch": 0.8842417074707372, "grad_norm": 2.0376179218292236, "learning_rate": 1.7016408817569606e-07, "loss": 0.4381, "step": 80000 }, { "epoch": 0.8853470096050755, "grad_norm": 1.6137086153030396, "learning_rate": 1.6696651916043666e-07, "loss": 0.4361, "step": 80100 }, { "epoch": 0.886452311739414, "grad_norm": 1.7986013889312744, "learning_rate": 1.6379824056670934e-07, "loss": 0.4719, "step": 80200 }, { "epoch": 0.8875576138737524, "grad_norm": 1.8301312923431396, "learning_rate": 1.6065929217114696e-07, "loss": 0.4262, "step": 80300 }, { "epoch": 0.8886629160080908, "grad_norm": 2.35886287689209, "learning_rate": 1.575497133821524e-07, "loss": 0.4535, "step": 80400 }, { "epoch": 0.8897682181424292, "grad_norm": 1.7016726732254028, "learning_rate": 1.5446954323940223e-07, "loss": 0.4294, "step": 80500 }, { "epoch": 0.8908735202767677, "grad_norm": 1.589161992073059, "learning_rate": 1.5141882041335737e-07, "loss": 0.4309, "step": 80600 }, { "epoch": 0.891978822411106, "grad_norm": 2.3803720474243164, "learning_rate": 1.4839758320477958e-07, "loss": 0.4318, "step": 80700 }, { "epoch": 0.8930841245454445, "grad_norm": 2.638575315475464, "learning_rate": 1.454058695442484e-07, "loss": 0.4678, "step": 80800 }, { "epoch": 0.894189426679783, "grad_norm": 1.9479451179504395, "learning_rate": 1.4244371699168453e-07, "loss": 0.4264, "step": 80900 }, { "epoch": 0.8952947288141213, "grad_norm": 1.9173952341079712, "learning_rate": 1.3951116273588e-07, "loss": 0.4507, "step": 81000 }, { "epoch": 0.8964000309484598, "grad_norm": 1.8866360187530518, "learning_rate": 1.3660824359403107e-07, "loss": 0.4359, "step": 81100 }, { "epoch": 0.8975053330827982, "grad_norm": 2.116718053817749, "learning_rate": 1.3373499601127466e-07, "loss": 0.4451, "step": 81200 }, { "epoch": 0.8986106352171366, "grad_norm": 2.32564377784729, "learning_rate": 1.308914560602323e-07, "loss": 0.4198, "step": 81300 }, { "epoch": 0.899715937351475, "grad_norm": 2.0888161659240723, "learning_rate": 1.2807765944055528e-07, "loss": 0.4543, "step": 81400 }, { "epoch": 0.9008212394858135, "grad_norm": 2.4812674522399902, "learning_rate": 1.2529364147847918e-07, "loss": 0.4323, "step": 81500 }, { "epoch": 0.9019265416201518, "grad_norm": 1.4540350437164307, "learning_rate": 1.2253943712637883e-07, "loss": 0.4429, "step": 81600 }, { "epoch": 0.9030318437544903, "grad_norm": 2.2741010189056396, "learning_rate": 1.198150809623283e-07, "loss": 0.4087, "step": 81700 }, { "epoch": 0.9041371458888288, "grad_norm": 1.891856074333191, "learning_rate": 1.1712060718966967e-07, "loss": 0.4314, "step": 81800 }, { "epoch": 0.9052424480231671, "grad_norm": 2.013892412185669, "learning_rate": 1.1445604963658041e-07, "loss": 0.413, "step": 81900 }, { "epoch": 0.9063477501575056, "grad_norm": 1.5470303297042847, "learning_rate": 1.1182144175565207e-07, "loss": 0.4102, "step": 82000 }, { "epoch": 0.9074530522918439, "grad_norm": 2.09853196144104, "learning_rate": 1.0921681662346695e-07, "loss": 0.4228, "step": 82100 }, { "epoch": 0.9085583544261824, "grad_norm": 1.8821436166763306, "learning_rate": 1.0664220694018512e-07, "loss": 0.4499, "step": 82200 }, { "epoch": 0.9096636565605208, "grad_norm": 2.268958568572998, "learning_rate": 1.0409764502913311e-07, "loss": 0.457, "step": 82300 }, { "epoch": 0.9107689586948592, "grad_norm": 2.286543607711792, "learning_rate": 1.0158316283639807e-07, "loss": 0.4531, "step": 82400 }, { "epoch": 0.9118742608291976, "grad_norm": 1.7463018894195557, "learning_rate": 9.909879193042731e-08, "loss": 0.4182, "step": 82500 }, { "epoch": 0.9129795629635361, "grad_norm": 1.9405850172042847, "learning_rate": 9.664456350163055e-08, "loss": 0.4074, "step": 82600 }, { "epoch": 0.9140848650978745, "grad_norm": 1.8400213718414307, "learning_rate": 9.422050836198904e-08, "loss": 0.4281, "step": 82700 }, { "epoch": 0.9151901672322129, "grad_norm": 2.0934810638427734, "learning_rate": 9.182665694467019e-08, "loss": 0.4394, "step": 82800 }, { "epoch": 0.9162954693665514, "grad_norm": 1.6910539865493774, "learning_rate": 8.946303930364386e-08, "loss": 0.4511, "step": 82900 }, { "epoch": 0.9174007715008897, "grad_norm": 1.2215235233306885, "learning_rate": 8.712968511330439e-08, "loss": 0.4427, "step": 83000 }, { "epoch": 0.9185060736352282, "grad_norm": 1.4822089672088623, "learning_rate": 8.482662366809947e-08, "loss": 0.4029, "step": 83100 }, { "epoch": 0.9196113757695666, "grad_norm": 1.9130114316940308, "learning_rate": 8.255388388216267e-08, "loss": 0.4471, "step": 83200 }, { "epoch": 0.920716677903905, "grad_norm": 1.6017576456069946, "learning_rate": 8.031149428894936e-08, "loss": 0.449, "step": 83300 }, { "epoch": 0.9218219800382434, "grad_norm": 1.9857609272003174, "learning_rate": 7.80994830408785e-08, "loss": 0.4505, "step": 83400 }, { "epoch": 0.9229272821725819, "grad_norm": 1.8383105993270874, "learning_rate": 7.59178779089792e-08, "loss": 0.4387, "step": 83500 }, { "epoch": 0.9240325843069203, "grad_norm": 1.5734336376190186, "learning_rate": 7.376670628254368e-08, "loss": 0.4456, "step": 83600 }, { "epoch": 0.9251378864412587, "grad_norm": 1.7729212045669556, "learning_rate": 7.16459951687809e-08, "loss": 0.4252, "step": 83700 }, { "epoch": 0.9262431885755972, "grad_norm": 2.0925188064575195, "learning_rate": 6.955577119247909e-08, "loss": 0.4397, "step": 83800 }, { "epoch": 0.9273484907099355, "grad_norm": 1.6489801406860352, "learning_rate": 6.749606059567177e-08, "loss": 0.4241, "step": 83900 }, { "epoch": 0.928453792844274, "grad_norm": 2.122025728225708, "learning_rate": 6.546688923730587e-08, "loss": 0.4509, "step": 84000 }, { "epoch": 0.9295590949786124, "grad_norm": 1.5546257495880127, "learning_rate": 6.346828259292114e-08, "loss": 0.4283, "step": 84100 }, { "epoch": 0.9306643971129508, "grad_norm": 1.4313548803329468, "learning_rate": 6.150026575432622e-08, "loss": 0.4315, "step": 84200 }, { "epoch": 0.9317696992472893, "grad_norm": 2.144721269607544, "learning_rate": 5.956286342928608e-08, "loss": 0.4046, "step": 84300 }, { "epoch": 0.9328750013816277, "grad_norm": 1.9185172319412231, "learning_rate": 5.7656099941210966e-08, "loss": 0.4762, "step": 84400 }, { "epoch": 0.9339803035159661, "grad_norm": 2.0306639671325684, "learning_rate": 5.577999922885158e-08, "loss": 0.4347, "step": 84500 }, { "epoch": 0.9350856056503045, "grad_norm": 2.1696221828460693, "learning_rate": 5.393458484599823e-08, "loss": 0.4654, "step": 84600 }, { "epoch": 0.936190907784643, "grad_norm": 3.1747541427612305, "learning_rate": 5.2119879961184114e-08, "loss": 0.4361, "step": 84700 }, { "epoch": 0.9372962099189813, "grad_norm": 2.4681639671325684, "learning_rate": 5.033590735739641e-08, "loss": 0.4064, "step": 84800 }, { "epoch": 0.9384015120533198, "grad_norm": 2.34089732170105, "learning_rate": 4.858268943178868e-08, "loss": 0.4839, "step": 84900 }, { "epoch": 0.9395068141876582, "grad_norm": 2.1032681465148926, "learning_rate": 4.686024819540058e-08, "loss": 0.4256, "step": 85000 }, { "epoch": 0.9406121163219966, "grad_norm": 2.1643483638763428, "learning_rate": 4.5168605272881414e-08, "loss": 0.4503, "step": 85100 }, { "epoch": 0.941717418456335, "grad_norm": 1.97984778881073, "learning_rate": 4.350778190221699e-08, "loss": 0.424, "step": 85200 }, { "epoch": 0.9428227205906735, "grad_norm": 2.1957056522369385, "learning_rate": 4.187779893446597e-08, "loss": 0.4226, "step": 85300 }, { "epoch": 0.9439280227250119, "grad_norm": 2.0904030799865723, "learning_rate": 4.027867683349618e-08, "loss": 0.4394, "step": 85400 }, { "epoch": 0.9450333248593503, "grad_norm": 1.8033450841903687, "learning_rate": 3.87104356757273e-08, "loss": 0.4806, "step": 85500 }, { "epoch": 0.9461386269936887, "grad_norm": 1.6405876874923706, "learning_rate": 3.717309514988027e-08, "loss": 0.4618, "step": 85600 }, { "epoch": 0.9472439291280271, "grad_norm": 2.6198575496673584, "learning_rate": 3.566667455672912e-08, "loss": 0.4313, "step": 85700 }, { "epoch": 0.9483492312623656, "grad_norm": 1.9811842441558838, "learning_rate": 3.4191192808858966e-08, "loss": 0.4057, "step": 85800 }, { "epoch": 0.9494545333967039, "grad_norm": 1.8522582054138184, "learning_rate": 3.27466684304284e-08, "loss": 0.4433, "step": 85900 }, { "epoch": 0.9505598355310424, "grad_norm": 2.586599826812744, "learning_rate": 3.133311955693691e-08, "loss": 0.414, "step": 86000 }, { "epoch": 0.9516651376653809, "grad_norm": 1.4519222974777222, "learning_rate": 2.995056393499757e-08, "loss": 0.4333, "step": 86100 }, { "epoch": 0.9527704397997192, "grad_norm": 2.7613425254821777, "learning_rate": 2.859901892211442e-08, "loss": 0.4776, "step": 86200 }, { "epoch": 0.9538757419340577, "grad_norm": 1.8202546834945679, "learning_rate": 2.7278501486463216e-08, "loss": 0.4269, "step": 86300 }, { "epoch": 0.9549810440683961, "grad_norm": 2.257310390472412, "learning_rate": 2.598902820667992e-08, "loss": 0.4069, "step": 86400 }, { "epoch": 0.9560863462027345, "grad_norm": 2.6993019580841064, "learning_rate": 2.4730615271651716e-08, "loss": 0.413, "step": 86500 }, { "epoch": 0.9571916483370729, "grad_norm": 1.3624522686004639, "learning_rate": 2.3503278480313806e-08, "loss": 0.4277, "step": 86600 }, { "epoch": 0.9582969504714114, "grad_norm": 1.7707417011260986, "learning_rate": 2.230703324145156e-08, "loss": 0.4512, "step": 86700 }, { "epoch": 0.9594022526057497, "grad_norm": 1.9722903966903687, "learning_rate": 2.1141894573507014e-08, "loss": 0.4333, "step": 86800 }, { "epoch": 0.9605075547400882, "grad_norm": 2.174100399017334, "learning_rate": 2.000787710438934e-08, "loss": 0.473, "step": 86900 }, { "epoch": 0.9616128568744267, "grad_norm": 2.9068939685821533, "learning_rate": 1.8904995071292455e-08, "loss": 0.4919, "step": 87000 }, { "epoch": 0.962718159008765, "grad_norm": 2.1874163150787354, "learning_rate": 1.7833262320515744e-08, "loss": 0.4501, "step": 87100 }, { "epoch": 0.9638234611431035, "grad_norm": 2.9089388847351074, "learning_rate": 1.6792692307289747e-08, "loss": 0.4295, "step": 87200 }, { "epoch": 0.9649287632774419, "grad_norm": 2.2932639122009277, "learning_rate": 1.578329809560797e-08, "loss": 0.4246, "step": 87300 }, { "epoch": 0.9660340654117803, "grad_norm": 1.8238743543624878, "learning_rate": 1.4805092358062822e-08, "loss": 0.4535, "step": 87400 }, { "epoch": 0.9671393675461187, "grad_norm": 2.673421859741211, "learning_rate": 1.3858087375686335e-08, "loss": 0.4606, "step": 87500 }, { "epoch": 0.9682446696804572, "grad_norm": 2.192293405532837, "learning_rate": 1.2942295037795261e-08, "loss": 0.4632, "step": 87600 }, { "epoch": 0.9693499718147955, "grad_norm": 2.283832550048828, "learning_rate": 1.2057726841842865e-08, "loss": 0.444, "step": 87700 }, { "epoch": 0.970455273949134, "grad_norm": 1.8313320875167847, "learning_rate": 1.1204393893274878e-08, "loss": 0.415, "step": 87800 }, { "epoch": 0.9715605760834725, "grad_norm": 1.7791038751602173, "learning_rate": 1.0382306905388495e-08, "loss": 0.4252, "step": 87900 }, { "epoch": 0.9726658782178108, "grad_norm": 2.295269250869751, "learning_rate": 9.591476199199146e-09, "loss": 0.4614, "step": 88000 }, { "epoch": 0.9737711803521493, "grad_norm": 3.02966046333313, "learning_rate": 8.831911703310047e-09, "loss": 0.4545, "step": 88100 }, { "epoch": 0.9748764824864877, "grad_norm": 2.004098653793335, "learning_rate": 8.103622953789247e-09, "loss": 0.4399, "step": 88200 }, { "epoch": 0.9759817846208261, "grad_norm": 2.338454008102417, "learning_rate": 7.406619094047496e-09, "loss": 0.4413, "step": 88300 }, { "epoch": 0.9770870867551645, "grad_norm": 1.626102089881897, "learning_rate": 6.740908874725005e-09, "loss": 0.4362, "step": 88400 }, { "epoch": 0.978192388889503, "grad_norm": 1.9277746677398682, "learning_rate": 6.106500653581815e-09, "loss": 0.4365, "step": 88500 }, { "epoch": 0.9792976910238413, "grad_norm": 2.415738105773926, "learning_rate": 5.503402395391489e-09, "loss": 0.4642, "step": 88600 }, { "epoch": 0.9804029931581798, "grad_norm": 1.5694254636764526, "learning_rate": 4.931621671842301e-09, "loss": 0.441, "step": 88700 }, { "epoch": 0.9815082952925183, "grad_norm": 1.3973413705825806, "learning_rate": 4.391165661442043e-09, "loss": 0.4311, "step": 88800 }, { "epoch": 0.9826135974268566, "grad_norm": 1.9460673332214355, "learning_rate": 3.882041149427251e-09, "loss": 0.4422, "step": 88900 }, { "epoch": 0.9837188995611951, "grad_norm": 1.6558293104171753, "learning_rate": 3.404254527678286e-09, "loss": 0.423, "step": 89000 }, { "epoch": 0.9848242016955334, "grad_norm": 1.9977773427963257, "learning_rate": 2.957811794639942e-09, "loss": 0.456, "step": 89100 }, { "epoch": 0.9859295038298719, "grad_norm": 2.308818817138672, "learning_rate": 2.5427185552448496e-09, "loss": 0.4504, "step": 89200 }, { "epoch": 0.9870348059642103, "grad_norm": 3.172938108444214, "learning_rate": 2.158980020843804e-09, "loss": 0.476, "step": 89300 }, { "epoch": 0.9881401080985487, "grad_norm": 1.7046418190002441, "learning_rate": 1.8066010091402631e-09, "loss": 0.4402, "step": 89400 }, { "epoch": 0.9892454102328871, "grad_norm": 2.2551283836364746, "learning_rate": 1.485585944129564e-09, "loss": 0.4364, "step": 89500 }, { "epoch": 0.9903507123672256, "grad_norm": 2.2183637619018555, "learning_rate": 1.1959388560445207e-09, "loss": 0.4685, "step": 89600 }, { "epoch": 0.991456014501564, "grad_norm": 1.7988057136535645, "learning_rate": 9.376633813026891e-10, "loss": 0.445, "step": 89700 }, { "epoch": 0.9925613166359024, "grad_norm": 2.6220619678497314, "learning_rate": 7.107627624627911e-10, "loss": 0.4628, "step": 89800 }, { "epoch": 0.9936666187702409, "grad_norm": 2.123908519744873, "learning_rate": 5.152398481828025e-10, "loss": 0.4388, "step": 89900 }, { "epoch": 0.9947719209045792, "grad_norm": 2.005134344100952, "learning_rate": 3.510970931849822e-10, "loss": 0.4415, "step": 90000 }, { "epoch": 0.9958772230389177, "grad_norm": 2.5748660564422607, "learning_rate": 2.1833655822423027e-10, "loss": 0.4225, "step": 90100 }, { "epoch": 0.9969825251732561, "grad_norm": 1.9123564958572388, "learning_rate": 1.169599100625529e-10, "loss": 0.4542, "step": 90200 }, { "epoch": 0.9980878273075945, "grad_norm": 2.8671979904174805, "learning_rate": 4.6968421448523313e-11, "loss": 0.467, "step": 90300 }, { "epoch": 0.999193129441933, "grad_norm": 1.6871434450149536, "learning_rate": 8.362971101183448e-12, "loss": 0.437, "step": 90400 } ], "logging_steps": 100, "max_steps": 90473, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.968302960287416e+17, "train_batch_size": 3, "trial_name": null, "trial_params": null }