Upload exp_phase8_bridge_6layer_r128_20260503_014402/log.jsonl with huggingface_hub
Browse files
exp_phase8_bridge_6layer_r128_20260503_014402/log.jsonl
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"step": 0, "loss": 1.3259265422821045, "loss_mdlm": 1.3259265422821045, "lr": 0.0, "gnorm": 0.0008407659479416907, "bridge_gate_avg": 1.0, "bridge_out_proj_avg": 0.0, "elapsed_s": 3.4532642364501953}
|
| 2 |
+
{"step": 25, "loss": 0.9667863845825195, "loss_mdlm": 0.9667863845825195, "lr": 6.25e-06, "gnorm": 0.01560900080949068, "bridge_gate_avg": 1.000022570292155, "bridge_out_proj_avg": 2.4417680833721533e-05, "elapsed_s": 11.888298034667969}
|
| 3 |
+
{"step": 50, "loss": 1.4530165195465088, "loss_mdlm": 1.4530165195465088, "lr": 1.25e-05, "gnorm": 0.0035597558598965406, "bridge_gate_avg": 1.0001400709152222, "bridge_out_proj_avg": 6.380853907709631e-05, "elapsed_s": 20.20055603981018}
|
| 4 |
+
{"step": 75, "loss": 1.3447425365447998, "loss_mdlm": 1.3447425365447998, "lr": 1.8750000000000002e-05, "gnorm": 0.0024694984313100576, "bridge_gate_avg": 1.000496466954549, "bridge_out_proj_avg": 0.00017503371054772288, "elapsed_s": 28.820900678634644}
|
| 5 |
+
{"step": 100, "loss": 1.3353159427642822, "loss_mdlm": 1.3353159427642822, "lr": 2.5e-05, "gnorm": 0.0035887861158698797, "bridge_gate_avg": 1.0007124344507854, "bridge_out_proj_avg": 0.00029597835964523256, "elapsed_s": 37.201342821121216}
|
| 6 |
+
{"step": 125, "loss": 1.3480483293533325, "loss_mdlm": 1.3480483293533325, "lr": 3.125e-05, "gnorm": 0.004155014641582966, "bridge_gate_avg": 1.001103679339091, "bridge_out_proj_avg": 0.0004748403832005958, "elapsed_s": 45.25407409667969}
|
| 7 |
+
{"step": 150, "loss": 1.3248317241668701, "loss_mdlm": 1.3248317241668701, "lr": 3.7500000000000003e-05, "gnorm": 0.0028952741995453835, "bridge_gate_avg": 1.0018798112869263, "bridge_out_proj_avg": 0.0008492133832381418, "elapsed_s": 53.79943251609802}
|
| 8 |
+
{"step": 175, "loss": 1.332851767539978, "loss_mdlm": 1.332851767539978, "lr": 4.375e-05, "gnorm": 0.011863724328577518, "bridge_gate_avg": 1.0021581848462422, "bridge_out_proj_avg": 0.0009876508653784792, "elapsed_s": 62.04536175727844}
|
| 9 |
+
{"step": 200, "loss": 1.2901774644851685, "loss_mdlm": 1.2901774644851685, "lr": 5e-05, "gnorm": 0.005681826267391443, "bridge_gate_avg": 1.0025527079900105, "bridge_out_proj_avg": 0.0011195188271813095, "elapsed_s": 70.46946024894714}
|
| 10 |
+
{"step": 225, "loss": 1.3311935663223267, "loss_mdlm": 1.3311935663223267, "lr": 4.999016565957633e-05, "gnorm": 0.010893627069890499, "bridge_gate_avg": 1.002362569173177, "bridge_out_proj_avg": 0.0011245628508428733, "elapsed_s": 78.51280760765076}
|
| 11 |
+
{"step": 250, "loss": 1.3240993022918701, "loss_mdlm": 1.3240993022918701, "lr": 4.996067037544542e-05, "gnorm": 0.006826768163591623, "bridge_gate_avg": 1.0026812354723613, "bridge_out_proj_avg": 0.0011897839528198044, "elapsed_s": 86.77064990997314}
|
| 12 |
+
{"step": 275, "loss": 1.3625601530075073, "loss_mdlm": 1.3625601530075073, "lr": 4.991153735294049e-05, "gnorm": 0.013468630611896515, "bridge_gate_avg": 1.0031238396962483, "bridge_out_proj_avg": 0.0012399129142674308, "elapsed_s": 94.84146928787231}
|
| 13 |
+
{"step": 300, "loss": 1.287082552909851, "loss_mdlm": 1.287082552909851, "lr": 4.984280524733107e-05, "gnorm": 0.006170280743390322, "bridge_gate_avg": 1.0032566785812378, "bridge_out_proj_avg": 0.001269913782986502, "elapsed_s": 102.73384594917297}
|
| 14 |
+
{"step": 325, "loss": 1.3137075901031494, "loss_mdlm": 1.3137075901031494, "lr": 4.975452813341114e-05, "gnorm": 0.008435830473899841, "bridge_gate_avg": 1.0032087167104085, "bridge_out_proj_avg": 0.0012916803825646639, "elapsed_s": 111.17196273803711}
|
| 15 |
+
{"step": 350, "loss": 1.2115696668624878, "loss_mdlm": 1.2115696668624878, "lr": 4.96467754629559e-05, "gnorm": 0.09166917949914932, "bridge_gate_avg": 1.0034073789914448, "bridge_out_proj_avg": 0.0013306861510500312, "elapsed_s": 119.2549455165863}
|
| 16 |
+
{"step": 375, "loss": 1.2221903800964355, "loss_mdlm": 1.2221903800964355, "lr": 4.951963201008076e-05, "gnorm": 0.04713674262166023, "bridge_gate_avg": 1.0038543343544006, "bridge_out_proj_avg": 0.001369715842884034, "elapsed_s": 127.5653281211853}
|
| 17 |
+
{"step": 400, "loss": 1.3611942529678345, "loss_mdlm": 1.3611942529678345, "lr": 4.937319780454559e-05, "gnorm": 0.011368649080395699, "bridge_gate_avg": 1.0041199525197346, "bridge_out_proj_avg": 0.0013951143288674455, "elapsed_s": 135.64750909805298}
|
| 18 |
+
{"step": 425, "loss": 1.439297080039978, "loss_mdlm": 1.439297080039978, "lr": 4.9207588053056545e-05, "gnorm": 0.015016973949968815, "bridge_gate_avg": 1.0045283834139507, "bridge_out_proj_avg": 0.0014268460217863321, "elapsed_s": 144.03881406784058}
|
| 19 |
+
{"step": 450, "loss": 1.364678978919983, "loss_mdlm": 1.364678978919983, "lr": 4.9022933048627496e-05, "gnorm": 0.018032994121313095, "bridge_gate_avg": 1.0047624905904133, "bridge_out_proj_avg": 0.001450096412251393, "elapsed_s": 152.31860518455505}
|
| 20 |
+
{"step": 475, "loss": 1.4881820678710938, "loss_mdlm": 1.4881820678710938, "lr": 4.881937806807241e-05, "gnorm": 0.015777165070176125, "bridge_gate_avg": 1.004913369814555, "bridge_out_proj_avg": 0.0014599599526263773, "elapsed_s": 160.27463722229004}
|
| 21 |
+
{"step": 500, "loss": 1.3133773803710938, "loss_mdlm": 1.3133773803710938, "lr": 4.8597083257709194e-05, "gnorm": 0.01622500829398632, "bridge_gate_avg": 1.0050513744354248, "bridge_out_proj_avg": 0.0014693999546580017, "elapsed_s": 168.69454765319824}
|
| 22 |
+
{"step": 525, "loss": 1.3761178255081177, "loss_mdlm": 1.3761178255081177, "lr": 4.8356223507364996e-05, "gnorm": 0.020520636811852455, "bridge_gate_avg": 1.0053996443748474, "bridge_out_proj_avg": 0.0014988186497551699, "elapsed_s": 176.99373316764832}
|
| 23 |
+
{"step": 550, "loss": 1.3680384159088135, "loss_mdlm": 1.3680384159088135, "lr": 4.8096988312782174e-05, "gnorm": 0.02605566754937172, "bridge_gate_avg": 1.0057567755381267, "bridge_out_proj_avg": 0.0015208408003672957, "elapsed_s": 185.37195777893066}
|
| 24 |
+
{"step": 575, "loss": 1.3393467664718628, "loss_mdlm": 1.3393467664718628, "lr": 4.781958162653297e-05, "gnorm": 0.016667336225509644, "bridge_gate_avg": 1.0059468150138855, "bridge_out_proj_avg": 0.0015286272197651367, "elapsed_s": 193.47247219085693}
|
| 25 |
+
{"step": 600, "loss": 1.3628203868865967, "loss_mdlm": 1.3628203868865967, "lr": 4.752422169756048e-05, "gnorm": 0.015346413478255272, "bridge_gate_avg": 1.006137450536092, "bridge_out_proj_avg": 0.0015394391763644915, "elapsed_s": 201.58957934379578}
|
| 26 |
+
{"step": 625, "loss": 1.3041877746582031, "loss_mdlm": 1.3041877746582031, "lr": 4.721114089947181e-05, "gnorm": 0.013416657224297523, "bridge_gate_avg": 1.006204644838969, "bridge_out_proj_avg": 0.0015502419943610828, "elapsed_s": 209.70430517196655}
|
| 27 |
+
{"step": 650, "loss": 1.2911816835403442, "loss_mdlm": 1.2911816835403442, "lr": 4.6880585547718845e-05, "gnorm": 0.013433833606541157, "bridge_gate_avg": 1.006552000840505, "bridge_out_proj_avg": 0.0015662233151185017, "elapsed_s": 218.01599264144897}
|
| 28 |
+
{"step": 675, "loss": 1.3501548767089844, "loss_mdlm": 1.3501548767089844, "lr": 4.653281570581023e-05, "gnorm": 0.013700229115784168, "bridge_gate_avg": 1.0067185759544373, "bridge_out_proj_avg": 0.0015787525141301255, "elapsed_s": 225.98654556274414}
|
| 29 |
+
{"step": 700, "loss": 1.3161166906356812, "loss_mdlm": 1.3161166906356812, "lr": 4.6168104980707107e-05, "gnorm": 0.011203687638044357, "bridge_gate_avg": 1.006902853647868, "bridge_out_proj_avg": 0.0015887953923083842, "elapsed_s": 234.3763611316681}
|
| 30 |
+
{"step": 725, "loss": 1.3370239734649658, "loss_mdlm": 1.3370239734649658, "lr": 4.5786740307563636e-05, "gnorm": 0.01584548130631447, "bridge_gate_avg": 1.0069229205449421, "bridge_out_proj_avg": 0.0016003404937994976, "elapsed_s": 242.55514550209045}
|
| 31 |
+
{"step": 750, "loss": 1.3557748794555664, "loss_mdlm": 1.3557748794555664, "lr": 4.538902172398151e-05, "gnorm": 0.02942849136888981, "bridge_gate_avg": 1.006959040959676, "bridge_out_proj_avg": 0.0016093689870710175, "elapsed_s": 250.68170762062073}
|
| 32 |
+
{"step": 775, "loss": 1.297780156135559, "loss_mdlm": 1.297780156135559, "lr": 4.497526213395623e-05, "gnorm": 0.01657259091734886, "bridge_gate_avg": 1.0068645278612773, "bridge_out_proj_avg": 0.001620987042163809, "elapsed_s": 258.88362526893616}
|
| 33 |
+
{"step": 800, "loss": 1.2860358953475952, "loss_mdlm": 1.2860358953475952, "lr": 4.454578706170075e-05, "gnorm": 0.017908962443470955, "bridge_gate_avg": 1.0069910089174907, "bridge_out_proj_avg": 0.0016270160170582433, "elapsed_s": 266.7442443370819}
|
| 34 |
+
{"step": 825, "loss": 1.2961452007293701, "loss_mdlm": 1.2961452007293701, "lr": 4.410093439554019e-05, "gnorm": 0.018056141212582588, "bridge_gate_avg": 1.0070714553197224, "bridge_out_proj_avg": 0.0016343465734583635, "elapsed_s": 274.8778727054596}
|
| 35 |
+
{"step": 850, "loss": 1.403436541557312, "loss_mdlm": 1.403436541557312, "lr": 4.364105412207914e-05, "gnorm": 0.01024180743843317, "bridge_gate_avg": 1.0072417855262756, "bridge_out_proj_avg": 0.0016412942786701024, "elapsed_s": 283.18827652931213}
|
| 36 |
+
{"step": 875, "loss": 1.354021668434143, "loss_mdlm": 1.354021668434143, "lr": 4.316650805085068e-05, "gnorm": 0.00954077672213316, "bridge_gate_avg": 1.0075311263402302, "bridge_out_proj_avg": 0.0016519549729612966, "elapsed_s": 291.1498975753784}
|
| 37 |
+
{"step": 900, "loss": 1.293360710144043, "loss_mdlm": 1.293360710144043, "lr": 4.267766952966369e-05, "gnorm": 0.02152811549603939, "bridge_gate_avg": 1.0076918204625447, "bridge_out_proj_avg": 0.001660117704886943, "elapsed_s": 299.03033995628357}
|
| 38 |
+
{"step": 925, "loss": 1.308851957321167, "loss_mdlm": 1.308851957321167, "lr": 4.2174923150872544e-05, "gnorm": 0.011474204249680042, "bridge_gate_avg": 1.0079240401585896, "bridge_out_proj_avg": 0.0016699178959243, "elapsed_s": 307.14173769950867}
|
| 39 |
+
{"step": 950, "loss": 1.394288420677185, "loss_mdlm": 1.394288420677185, "lr": 4.16586644488001e-05, "gnorm": 0.017715131863951683, "bridge_gate_avg": 1.0081361929575603, "bridge_out_proj_avg": 0.001680508372373879, "elapsed_s": 314.95003628730774}
|
| 40 |
+
{"step": 975, "loss": 1.3472524881362915, "loss_mdlm": 1.3472524881362915, "lr": 4.1129299588552193e-05, "gnorm": 0.022405602037906647, "bridge_gate_avg": 1.0082210501035054, "bridge_out_proj_avg": 0.0016854821781938274, "elapsed_s": 322.9189558029175}
|
| 41 |
+
{"step": 1000, "loss": 1.344576120376587, "loss_mdlm": 1.344576120376587, "lr": 4.058724504646834e-05, "gnorm": 0.018821051344275475, "bridge_gate_avg": 1.0080793897310893, "bridge_out_proj_avg": 0.0016935796787341435, "elapsed_s": 331.12781262397766}
|
| 42 |
+
{"step": 1025, "loss": 1.308237075805664, "loss_mdlm": 1.308237075805664, "lr": 4.0032927282460146e-05, "gnorm": 0.012080981396138668, "bridge_gate_avg": 1.0081587235132854, "bridge_out_proj_avg": 0.0016997390387890239, "elapsed_s": 339.1649692058563}
|
| 43 |
+
{"step": 1050, "loss": 1.378395915031433, "loss_mdlm": 1.378395915031433, "lr": 3.946678240449515e-05, "gnorm": 0.024929288774728775, "bridge_gate_avg": 1.0083387891451518, "bridge_out_proj_avg": 0.001706016172344486, "elapsed_s": 347.27006673812866}
|
| 44 |
+
{"step": 1075, "loss": 1.2798488140106201, "loss_mdlm": 1.2798488140106201, "lr": 3.888925582549006e-05, "gnorm": 0.020891953259706497, "bridge_gate_avg": 1.0083383321762085, "bridge_out_proj_avg": 0.0017147226414332788, "elapsed_s": 355.41885900497437}
|
| 45 |
+
{"step": 1100, "loss": 1.3566333055496216, "loss_mdlm": 1.3566333055496216, "lr": 3.830080191288342e-05, "gnorm": 0.01759035512804985, "bridge_gate_avg": 1.0084609587987263, "bridge_out_proj_avg": 0.0017225509315418701, "elapsed_s": 363.6525754928589}
|
| 46 |
+
{"step": 1125, "loss": 1.3159223794937134, "loss_mdlm": 1.3159223794937134, "lr": 3.770188363116324e-05, "gnorm": 0.014562332071363926, "bridge_gate_avg": 1.0084339181582134, "bridge_out_proj_avg": 0.0017301546758972108, "elapsed_s": 372.0996630191803}
|
| 47 |
+
{"step": 1150, "loss": 1.3739136457443237, "loss_mdlm": 1.3739136457443237, "lr": 3.7092972177631e-05, "gnorm": 0.015484306961297989, "bridge_gate_avg": 1.0084357659022014, "bridge_out_proj_avg": 0.00173241610173136, "elapsed_s": 380.32668805122375}
|
| 48 |
+
{"step": 1175, "loss": 1.3864264488220215, "loss_mdlm": 1.3864264488220215, "lr": 3.6474546611688445e-05, "gnorm": 0.03088589943945408, "bridge_gate_avg": 1.0084308981895447, "bridge_out_proj_avg": 0.0017467282402018707, "elapsed_s": 388.5576000213623}
|
| 49 |
+
{"step": 1200, "loss": 1.3416414260864258, "loss_mdlm": 1.3416414260864258, "lr": 3.5847093477938956e-05, "gnorm": 0.012309256009757519, "bridge_gate_avg": 1.0083843270937602, "bridge_out_proj_avg": 0.0017565450204225879, "elapsed_s": 396.8285048007965}
|
| 50 |
+
{"step": 1225, "loss": 1.3198059797286987, "loss_mdlm": 1.3198059797286987, "lr": 3.521110642339991e-05, "gnorm": 0.018513798713684082, "bridge_gate_avg": 1.0085140268007915, "bridge_out_proj_avg": 0.001764647817860047, "elapsed_s": 405.1885645389557}
|
| 51 |
+
{"step": 1250, "loss": 1.3081120252609253, "loss_mdlm": 1.3081120252609253, "lr": 3.456708580912725e-05, "gnorm": 0.010892514139413834, "bridge_gate_avg": 1.0086103677749634, "bridge_out_proj_avg": 0.0017706514336168766, "elapsed_s": 413.2898516654968}
|
| 52 |
+
{"step": 1275, "loss": 1.311785340309143, "loss_mdlm": 1.311785340309143, "lr": 3.391553831655782e-05, "gnorm": 0.01278949249535799, "bridge_gate_avg": 1.0085352460543315, "bridge_out_proj_avg": 0.0017718830689166982, "elapsed_s": 421.36319160461426}
|
| 53 |
+
{"step": 1300, "loss": 1.3287440538406372, "loss_mdlm": 1.3287440538406372, "lr": 3.3256976548879184e-05, "gnorm": 0.019487539306282997, "bridge_gate_avg": 1.0084249377250671, "bridge_out_proj_avg": 0.0017750685995755096, "elapsed_s": 429.60070180892944}
|
| 54 |
+
{"step": 1325, "loss": 1.2901521921157837, "loss_mdlm": 1.2901521921157837, "lr": 3.259191862774037e-05, "gnorm": 0.01654678024351597, "bridge_gate_avg": 1.0086421569188435, "bridge_out_proj_avg": 0.0017795986495912075, "elapsed_s": 437.9385085105896}
|
| 55 |
+
{"step": 1350, "loss": 1.2993321418762207, "loss_mdlm": 1.2993321418762207, "lr": 3.1920887785621235e-05, "gnorm": 0.013396786525845528, "bridge_gate_avg": 1.0086382428805034, "bridge_out_proj_avg": 0.0017818588336619239, "elapsed_s": 446.1899321079254}
|
| 56 |
+
{"step": 1375, "loss": 1.3310885429382324, "loss_mdlm": 1.3310885429382324, "lr": 3.1244411954180676e-05, "gnorm": 0.01752319745719433, "bridge_gate_avg": 1.0085743467013042, "bridge_out_proj_avg": 0.0017854247513848047, "elapsed_s": 454.33872509002686}
|
| 57 |
+
{"step": 1400, "loss": 1.2892560958862305, "loss_mdlm": 1.2892560958862305, "lr": 3.056302334890786e-05, "gnorm": 0.030790848657488823, "bridge_gate_avg": 1.0087504585584004, "bridge_out_proj_avg": 0.0017849190820318956, "elapsed_s": 462.4736764431}
|
| 58 |
+
{"step": 1425, "loss": 1.3179521560668945, "loss_mdlm": 1.3179521560668945, "lr": 2.9877258050403212e-05, "gnorm": 0.02639157697558403, "bridge_gate_avg": 1.008877952893575, "bridge_out_proj_avg": 0.001787332643289119, "elapsed_s": 470.8544433116913}
|
| 59 |
+
{"step": 1450, "loss": 1.345336675643921, "loss_mdlm": 1.345336675643921, "lr": 2.918765558261841e-05, "gnorm": 0.019193870946764946, "bridge_gate_avg": 1.008835991223653, "bridge_out_proj_avg": 0.0017913920843663316, "elapsed_s": 479.07552695274353}
|
| 60 |
+
{"step": 1475, "loss": 1.3317831754684448, "loss_mdlm": 1.3317831754684448, "lr": 2.849475848838749e-05, "gnorm": 0.02050572633743286, "bridge_gate_avg": 1.0089526971181233, "bridge_out_proj_avg": 0.0017980645837572713, "elapsed_s": 487.2965576648712}
|
| 61 |
+
{"step": 1500, "loss": 1.346705675125122, "loss_mdlm": 1.346705675125122, "lr": 2.7799111902582696e-05, "gnorm": 0.020214449614286423, "bridge_gate_avg": 1.0090652505556743, "bridge_out_proj_avg": 0.0018048258498311043, "elapsed_s": 495.61996388435364}
|
| 62 |
+
{"step": 1525, "loss": 1.3497341871261597, "loss_mdlm": 1.3497341871261597, "lr": 2.710126312323119e-05, "gnorm": 0.013137716799974442, "bridge_gate_avg": 1.0091297030448914, "bridge_out_proj_avg": 0.001805088094746073, "elapsed_s": 503.91157126426697}
|
| 63 |
+
{"step": 1550, "loss": 1.3210445642471313, "loss_mdlm": 1.3210445642471313, "lr": 2.6401761180929797e-05, "gnorm": 0.022918788716197014, "bridge_gate_avg": 1.0091434121131897, "bridge_out_proj_avg": 0.0018086824954176943, "elapsed_s": 512.1636474132538}
|
| 64 |
+
{"step": 1575, "loss": 1.4064651727676392, "loss_mdlm": 1.4064651727676392, "lr": 2.5701156406896725e-05, "gnorm": 0.020122798159718513, "bridge_gate_avg": 1.0091225504875183, "bridge_out_proj_avg": 0.0018101085637075205, "elapsed_s": 520.5299463272095}
|
| 65 |
+
{"step": 1600, "loss": 1.336951494216919, "loss_mdlm": 1.336951494216919, "lr": 2.5e-05, "gnorm": 0.01664118841290474, "bridge_gate_avg": 1.009174644947052, "bridge_out_proj_avg": 0.0018109608984862764, "elapsed_s": 528.3040704727173}
|
| 66 |
+
{"step": 1625, "loss": 1.3356705904006958, "loss_mdlm": 1.3356705904006958, "lr": 2.429884359310328e-05, "gnorm": 0.01176365464925766, "bridge_gate_avg": 1.0091951489448547, "bridge_out_proj_avg": 0.0018145893894446392, "elapsed_s": 536.4204063415527}
|
| 67 |
+
{"step": 1650, "loss": 1.2772010564804077, "loss_mdlm": 1.2772010564804077, "lr": 2.3598238819070202e-05, "gnorm": 0.014353317208588123, "bridge_gate_avg": 1.0092175205548604, "bridge_out_proj_avg": 0.0018160992379610736, "elapsed_s": 544.5533821582794}
|
| 68 |
+
{"step": 1675, "loss": 1.303608775138855, "loss_mdlm": 1.303608775138855, "lr": 2.2898736876768815e-05, "gnorm": 0.020490305498242378, "bridge_gate_avg": 1.0092294017473857, "bridge_out_proj_avg": 0.0018192071196002264, "elapsed_s": 552.7757914066315}
|
| 69 |
+
{"step": 1700, "loss": 1.3140945434570312, "loss_mdlm": 1.3140945434570312, "lr": 2.2200888097417307e-05, "gnorm": 0.015770236030220985, "bridge_gate_avg": 1.0092209776242573, "bridge_out_proj_avg": 0.0018203265693349142, "elapsed_s": 560.887214422226}
|
| 70 |
+
{"step": 1725, "loss": 1.3260937929153442, "loss_mdlm": 1.3260937929153442, "lr": 2.1505241511612522e-05, "gnorm": 0.016024267300963402, "bridge_gate_avg": 1.0091235240300496, "bridge_out_proj_avg": 0.0018182547840600212, "elapsed_s": 569.018411397934}
|
| 71 |
+
{"step": 1750, "loss": 0.9420912861824036, "loss_mdlm": 0.9420912861824036, "lr": 2.0812344417381595e-05, "gnorm": 0.05657318979501724, "bridge_gate_avg": 1.0091320276260376, "bridge_out_proj_avg": 0.0018176289352898796, "elapsed_s": 576.8618602752686}
|
| 72 |
+
{"step": 1775, "loss": 1.3112887144088745, "loss_mdlm": 1.3112887144088745, "lr": 2.0122741949596797e-05, "gnorm": 0.027939481660723686, "bridge_gate_avg": 1.0091121395428975, "bridge_out_proj_avg": 0.0018159876926802099, "elapsed_s": 584.9445703029633}
|
| 73 |
+
{"step": 1800, "loss": 1.3370712995529175, "loss_mdlm": 1.3370712995529175, "lr": 1.9436976651092144e-05, "gnorm": 0.018038824200630188, "bridge_gate_avg": 1.009056568145752, "bridge_out_proj_avg": 0.00181624215717117, "elapsed_s": 593.2294244766235}
|
| 74 |
+
{"step": 1825, "loss": 0.870624303817749, "loss_mdlm": 0.870624303817749, "lr": 1.8755588045819327e-05, "gnorm": 0.14330516755580902, "bridge_gate_avg": 1.009046236673991, "bridge_out_proj_avg": 0.001818549334226797, "elapsed_s": 601.1885485649109}
|
| 75 |
+
{"step": 1850, "loss": 1.3985174894332886, "loss_mdlm": 1.3985174894332886, "lr": 1.8079112214378768e-05, "gnorm": 0.03189157322049141, "bridge_gate_avg": 1.009023368358612, "bridge_out_proj_avg": 0.0018211797966311376, "elapsed_s": 609.227278470993}
|
| 76 |
+
{"step": 1875, "loss": 1.3755756616592407, "loss_mdlm": 1.3755756616592407, "lr": 1.7408081372259632e-05, "gnorm": 0.019743280485272408, "bridge_gate_avg": 1.0090022285779316, "bridge_out_proj_avg": 0.0018235419411212206, "elapsed_s": 617.2483174800873}
|
| 77 |
+
{"step": 1900, "loss": 1.3390381336212158, "loss_mdlm": 1.3390381336212158, "lr": 1.6743023451120832e-05, "gnorm": 0.02038818784058094, "bridge_gate_avg": 1.0089864532152812, "bridge_out_proj_avg": 0.0018257763780032594, "elapsed_s": 625.5867650508881}
|
| 78 |
+
{"step": 1925, "loss": 1.2868070602416992, "loss_mdlm": 1.2868070602416992, "lr": 1.6084461683442176e-05, "gnorm": 0.028307119384407997, "bridge_gate_avg": 1.0090256730715434, "bridge_out_proj_avg": 0.00182673860884582, "elapsed_s": 633.874630689621}
|
| 79 |
+
{"step": 1950, "loss": 1.3487915992736816, "loss_mdlm": 1.3487915992736816, "lr": 1.5432914190872757e-05, "gnorm": 0.025399794802069664, "bridge_gate_avg": 1.0090782443682353, "bridge_out_proj_avg": 0.0018268012984966238, "elapsed_s": 642.070161819458}
|
| 80 |
+
{"step": 1975, "loss": 1.3263812065124512, "loss_mdlm": 1.3263812065124512, "lr": 1.4788893576600099e-05, "gnorm": 0.022849684581160545, "bridge_gate_avg": 1.0090962052345276, "bridge_out_proj_avg": 0.0018263027886860073, "elapsed_s": 650.1734991073608}
|
| 81 |
+
{"step": 2000, "loss": 1.3320391178131104, "loss_mdlm": 1.3320391178131104, "lr": 1.4152906522061048e-05, "gnorm": 0.026236895471811295, "bridge_gate_avg": 1.0091471076011658, "bridge_out_proj_avg": 0.001828659054202338, "elapsed_s": 658.276743888855}
|
| 82 |
+
{"step": 2025, "loss": 1.2936828136444092, "loss_mdlm": 1.2936828136444092, "lr": 1.3525453388311554e-05, "gnorm": 0.032865989953279495, "bridge_gate_avg": 1.0091828902562459, "bridge_out_proj_avg": 0.0018296883014651637, "elapsed_s": 666.1792571544647}
|
| 83 |
+
{"step": 2050, "loss": 1.3480461835861206, "loss_mdlm": 1.3480461835861206, "lr": 1.2907027822369005e-05, "gnorm": 0.03299959748983383, "bridge_gate_avg": 1.00920703013738, "bridge_out_proj_avg": 0.0018305801204405725, "elapsed_s": 674.4589033126831}
|
| 84 |
+
{"step": 2075, "loss": 1.4064806699752808, "loss_mdlm": 1.4064806699752808, "lr": 1.229811636883677e-05, "gnorm": 0.021202165633440018, "bridge_gate_avg": 1.0092559059460957, "bridge_out_proj_avg": 0.0018319566152058542, "elapsed_s": 682.5923552513123}
|
| 85 |
+
{"step": 2100, "loss": 1.382045865058899, "loss_mdlm": 1.382045865058899, "lr": 1.1699198087116589e-05, "gnorm": 0.051836688071489334, "bridge_gate_avg": 1.009281059106191, "bridge_out_proj_avg": 0.001832533278502524, "elapsed_s": 690.580602645874}
|
| 86 |
+
{"step": 2125, "loss": 1.3177062273025513, "loss_mdlm": 1.3177062273025513, "lr": 1.1110744174509952e-05, "gnorm": 0.01388181746006012, "bridge_gate_avg": 1.009259859720866, "bridge_out_proj_avg": 0.0018322995747439563, "elapsed_s": 698.6244239807129}
|
| 87 |
+
{"step": 2150, "loss": 1.4270257949829102, "loss_mdlm": 1.4270257949829102, "lr": 1.0533217595504858e-05, "gnorm": 0.03501691296696663, "bridge_gate_avg": 1.0092491308848064, "bridge_out_proj_avg": 0.0018327350650603573, "elapsed_s": 706.6638147830963}
|
| 88 |
+
{"step": 2175, "loss": 1.3691902160644531, "loss_mdlm": 1.3691902160644531, "lr": 9.967072717539851e-06, "gnorm": 0.016246715560555458, "bridge_gate_avg": 1.009278376897176, "bridge_out_proj_avg": 0.0018337658451249201, "elapsed_s": 714.93825340271}
|
| 89 |
+
{"step": 2200, "loss": 1.2727166414260864, "loss_mdlm": 1.2727166414260864, "lr": 9.412754953531663e-06, "gnorm": 0.013563881628215313, "bridge_gate_avg": 1.0092821915944417, "bridge_out_proj_avg": 0.0018352825427427888, "elapsed_s": 722.9990360736847}
|
| 90 |
+
{"step": 2225, "loss": 1.219502329826355, "loss_mdlm": 1.219502329826355, "lr": 8.870700411447816e-06, "gnorm": 0.010860474780201912, "bridge_gate_avg": 1.0093005100886028, "bridge_out_proj_avg": 0.0018357483592505257, "elapsed_s": 731.348925113678}
|
| 91 |
+
{"step": 2250, "loss": 0.8469259142875671, "loss_mdlm": 0.8469259142875671, "lr": 8.341335551199902e-06, "gnorm": 0.023038119077682495, "bridge_gate_avg": 1.009327232837677, "bridge_out_proj_avg": 0.001836015241375814, "elapsed_s": 739.5049076080322}
|
| 92 |
+
{"step": 2275, "loss": 0.9421794414520264, "loss_mdlm": 0.9421794414520264, "lr": 7.825076849127458e-06, "gnorm": 0.05124269053339958, "bridge_gate_avg": 1.0093716979026794, "bridge_out_proj_avg": 0.001836272267003854, "elapsed_s": 747.3088886737823}
|
| 93 |
+
{"step": 2300, "loss": 1.2977546453475952, "loss_mdlm": 1.2977546453475952, "lr": 7.3223304703363135e-06, "gnorm": 0.009924443438649178, "bridge_gate_avg": 1.0093627174695332, "bridge_out_proj_avg": 0.001835917471908033, "elapsed_s": 755.582980632782}
|
| 94 |
+
{"step": 2325, "loss": 1.38175368309021, "loss_mdlm": 1.38175368309021, "lr": 6.833491949149329e-06, "gnorm": 0.01593959890305996, "bridge_gate_avg": 1.0093611280123393, "bridge_out_proj_avg": 0.0018358687520958483, "elapsed_s": 763.4469780921936}
|
| 95 |
+
{"step": 2350, "loss": 1.3536111116409302, "loss_mdlm": 1.3536111116409302, "lr": 6.358945877920861e-06, "gnorm": 0.0140481386333704, "bridge_gate_avg": 1.0093359351158142, "bridge_out_proj_avg": 0.0018359972746111453, "elapsed_s": 771.5804431438446}
|
| 96 |
+
{"step": 2375, "loss": 1.2263752222061157, "loss_mdlm": 1.2263752222061157, "lr": 5.899065604459814e-06, "gnorm": 0.014901441521942616, "bridge_gate_avg": 1.009331742922465, "bridge_out_proj_avg": 0.0018363717438963552, "elapsed_s": 779.7494225502014}
|
| 97 |
+
{"step": 2400, "loss": 1.3016619682312012, "loss_mdlm": 1.3016619682312012, "lr": 5.454212938299255e-06, "gnorm": 0.012749029323458672, "bridge_gate_avg": 1.0093392133712769, "bridge_out_proj_avg": 0.0018363069005620976, "elapsed_s": 787.9335145950317}
|
| 98 |
+
{"step": 2425, "loss": 1.3060543537139893, "loss_mdlm": 1.3060543537139893, "lr": 5.02473786604378e-06, "gnorm": 0.01660013757646084, "bridge_gate_avg": 1.0093375444412231, "bridge_out_proj_avg": 0.0018355648304956655, "elapsed_s": 796.0812304019928}
|
| 99 |
+
{"step": 2450, "loss": 1.1767995357513428, "loss_mdlm": 1.1767995357513428, "lr": 4.610978276018496e-06, "gnorm": 0.01931142248213291, "bridge_gate_avg": 1.009369154771169, "bridge_out_proj_avg": 0.0018360138831970592, "elapsed_s": 804.2033357620239}
|
| 100 |
+
{"step": 2475, "loss": 1.3536555767059326, "loss_mdlm": 1.3536555767059326, "lr": 4.213259692436367e-06, "gnorm": 0.03864279389381409, "bridge_gate_avg": 1.0093536575635274, "bridge_out_proj_avg": 0.0018359822764371831, "elapsed_s": 812.3306396007538}
|
| 101 |
+
{"step": 2500, "loss": 1.2895066738128662, "loss_mdlm": 1.2895066738128662, "lr": 3.831895019292897e-06, "gnorm": 0.023383906111121178, "bridge_gate_avg": 1.0093478560447693, "bridge_out_proj_avg": 0.0018356950022280216, "elapsed_s": 820.5304939746857}
|
| 102 |
+
{"step": 2525, "loss": 1.4101202487945557, "loss_mdlm": 1.4101202487945557, "lr": 3.4671842941897765e-06, "gnorm": 0.020931892096996307, "bridge_gate_avg": 1.0093289613723755, "bridge_out_proj_avg": 0.001835638036330541, "elapsed_s": 828.7419180870056}
|
| 103 |
+
{"step": 2550, "loss": 1.3141366243362427, "loss_mdlm": 1.3141366243362427, "lr": 3.119414452281158e-06, "gnorm": 0.03397688642144203, "bridge_gate_avg": 1.0093432466189067, "bridge_out_proj_avg": 0.0018359561800025403, "elapsed_s": 836.8825550079346}
|
| 104 |
+
{"step": 2575, "loss": 1.363105297088623, "loss_mdlm": 1.363105297088623, "lr": 2.788859100528196e-06, "gnorm": 0.01564246229827404, "bridge_gate_avg": 1.0093525846799214, "bridge_out_proj_avg": 0.0018361059677166243, "elapsed_s": 844.9581370353699}
|
| 105 |
+
{"step": 2600, "loss": 1.3352038860321045, "loss_mdlm": 1.3352038860321045, "lr": 2.475778302439524e-06, "gnorm": 0.03751305118203163, "bridge_gate_avg": 1.0093469222386677, "bridge_out_proj_avg": 0.0018360152607783675, "elapsed_s": 853.1731915473938}
|
| 106 |
+
{"step": 2625, "loss": 1.353458285331726, "loss_mdlm": 1.353458285331726, "lr": 2.1804183734670277e-06, "gnorm": 0.02104584500193596, "bridge_gate_avg": 1.009345571200053, "bridge_out_proj_avg": 0.0018359886986824374, "elapsed_s": 861.3044574260712}
|
| 107 |
+
{"step": 2650, "loss": 1.344861626625061, "loss_mdlm": 1.344861626625061, "lr": 1.9030116872178316e-06, "gnorm": 0.026171674951910973, "bridge_gate_avg": 1.009350061416626, "bridge_out_proj_avg": 0.0018358287246276934, "elapsed_s": 869.4215621948242}
|
| 108 |
+
{"step": 2675, "loss": 1.3488152027130127, "loss_mdlm": 1.3488152027130127, "lr": 1.6437764926350074e-06, "gnorm": 0.026766804978251457, "bridge_gate_avg": 1.0093571941057842, "bridge_out_proj_avg": 0.0018357226896720629, "elapsed_s": 877.7115664482117}
|
| 109 |
+
{"step": 2700, "loss": 1.3082574605941772, "loss_mdlm": 1.3082574605941772, "lr": 1.4029167422908107e-06, "gnorm": 0.015456887893378735, "bridge_gate_avg": 1.0093584855397542, "bridge_out_proj_avg": 0.001835465373005718, "elapsed_s": 885.7149596214294}
|
| 110 |
+
{"step": 2725, "loss": 1.3948131799697876, "loss_mdlm": 1.3948131799697876, "lr": 1.180621931927592e-06, "gnorm": 0.015920039266347885, "bridge_gate_avg": 1.0093626777331035, "bridge_out_proj_avg": 0.0018354317095751564, "elapsed_s": 892.8659901618958}
|
| 111 |
+
{"step": 2750, "loss": 1.3130736351013184, "loss_mdlm": 1.3130736351013184, "lr": 9.770669513725128e-07, "gnorm": 0.011319499462842941, "bridge_gate_avg": 1.0093612273534138, "bridge_out_proj_avg": 0.0018354119383729994, "elapsed_s": 899.0790481567383}
|
| 112 |
+
{"step": 2775, "loss": 1.3538495302200317, "loss_mdlm": 1.3538495302200317, "lr": 7.924119469434665e-07, "gnorm": 0.03895651176571846, "bridge_gate_avg": 1.0093609690666199, "bridge_out_proj_avg": 0.0018353445533042152, "elapsed_s": 905.4641127586365}
|
| 113 |
+
{"step": 2800, "loss": 1.3550598621368408, "loss_mdlm": 1.3550598621368408, "lr": 6.268021954544096e-07, "gnorm": 0.020594652742147446, "bridge_gate_avg": 1.0093639492988586, "bridge_out_proj_avg": 0.0018353323296954234, "elapsed_s": 913.3024325370789}
|
| 114 |
+
{"step": 2825, "loss": 1.3313506841659546, "loss_mdlm": 1.3313506841659546, "lr": 4.803679899192392e-07, "gnorm": 0.01210678182542324, "bridge_gate_avg": 1.009363830089569, "bridge_out_proj_avg": 0.0018353293999098241, "elapsed_s": 920.6820011138916}
|
| 115 |
+
{"step": 2850, "loss": 1.3164628744125366, "loss_mdlm": 1.3164628744125366, "lr": 3.5322453704410286e-07, "gnorm": 0.030071541666984558, "bridge_gate_avg": 1.0093642473220825, "bridge_out_proj_avg": 0.0018353470756361883, "elapsed_s": 926.9379017353058}
|
| 116 |
+
{"step": 2875, "loss": 1.2549861669540405, "loss_mdlm": 1.2549861669540405, "lr": 2.454718665888589e-07, "gnorm": 0.027844054624438286, "bridge_gate_avg": 1.0093650023142497, "bridge_out_proj_avg": 0.0018353444756940007, "elapsed_s": 933.2061822414398}
|
| 117 |
+
{"step": 2900, "loss": 1.3108859062194824, "loss_mdlm": 1.3108859062194824, "lr": 1.571947526689349e-07, "gnorm": 0.015730110928416252, "bridge_gate_avg": 1.0093648632367451, "bridge_out_proj_avg": 0.0018353513830030959, "elapsed_s": 939.757490158081}
|
| 118 |
+
{"step": 2925, "loss": 1.3160102367401123, "loss_mdlm": 1.3160102367401123, "lr": 8.846264705952289e-08, "gnorm": 0.021719852462410927, "bridge_gate_avg": 1.0093648632367451, "bridge_out_proj_avg": 0.0018353546232295532, "elapsed_s": 946.0379848480225}
|
| 119 |
+
{"step": 2950, "loss": 0.8860926628112793, "loss_mdlm": 0.8860926628112793, "lr": 3.9329624554584884e-08, "gnorm": 0.01663123443722725, "bridge_gate_avg": 1.0093648632367451, "bridge_out_proj_avg": 0.001835352120300134, "elapsed_s": 952.2641079425812}
|
| 120 |
+
{"step": 2975, "loss": 1.337233304977417, "loss_mdlm": 1.337233304977417, "lr": 9.834340423678368e-09, "gnorm": 0.02532481588423252, "bridge_gate_avg": 1.0093648632367451, "bridge_out_proj_avg": 0.0018353515382235248, "elapsed_s": 958.4577474594116}
|