diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,13025 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 10.647834234848162, + "global_step": 500001, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "FLOPS loss": 5.392782622948289e-05, + "L0_d": 15092.36, + "MLM loss": 8.760412216186523, + "epoch": 0.01, + "step": 499 + }, + { + "epoch": 0.01, + "learning_rate": 5e-06, + "loss": 9.6927, + "step": 500 + }, + { + "FLOPS loss": 0.0008161979494616389, + "L0_d": 27461.33, + "MLM loss": 7.4083781242370605, + "epoch": 0.02, + "step": 999 + }, + { + "epoch": 0.02, + "learning_rate": 1e-05, + "loss": 7.9871, + "step": 1000 + }, + { + "FLOPS loss": 0.0027657910250127316, + "L0_d": 29453.2, + "MLM loss": 6.897146224975586, + "epoch": 0.03, + "step": 1499 + }, + { + "epoch": 0.03, + "learning_rate": 1.5e-05, + "loss": 7.0651, + "step": 1500 + }, + { + "FLOPS loss": 0.005620635114610195, + "L0_d": 30316.53, + "MLM loss": 6.6747026443481445, + "epoch": 0.04, + "step": 1999 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 6.7723, + "step": 2000 + }, + { + "FLOPS loss": 0.010232314467430115, + "L0_d": 30685.58, + "MLM loss": 6.474103927612305, + "epoch": 0.05, + "step": 2499 + }, + { + "epoch": 0.05, + "learning_rate": 2.5e-05, + "loss": 6.615, + "step": 2500 + }, + { + "FLOPS loss": 0.01670052483677864, + "L0_d": 31151.2, + "MLM loss": 6.448917388916016, + "epoch": 0.06, + "step": 2999 + }, + { + "epoch": 0.06, + "learning_rate": 3e-05, + "loss": 6.487, + "step": 3000 + }, + { + "FLOPS loss": 0.022464273497462273, + "L0_d": 30915.17, + "MLM loss": 5.781277179718018, + "epoch": 0.07, + "step": 3499 + }, + { + "epoch": 0.07, + "learning_rate": 3.5e-05, + "loss": 6.1663, + "step": 3500 + }, + { + "FLOPS loss": 0.029723184183239937, + "L0_d": 30589.05, + "MLM loss": 5.28702449798584, + "epoch": 0.09, + "step": 3999 + }, + { + "epoch": 0.09, + "learning_rate": 4e-05, + "loss": 5.5923, + "step": 4000 + }, + { + "FLOPS loss": 0.0326639786362648, + "L0_d": 28512.73, + "MLM loss": 4.979520797729492, + "epoch": 0.1, + "step": 4499 + }, + { + "epoch": 0.1, + "learning_rate": 4.5e-05, + "loss": 5.0652, + "step": 4500 + }, + { + "FLOPS loss": 0.03443261235952377, + "L0_d": 25869.95, + "MLM loss": 4.665562629699707, + "epoch": 0.11, + "step": 4999 + }, + { + "epoch": 0.11, + "learning_rate": 5e-05, + "loss": 4.7192, + "step": 5000 + }, + { + "FLOPS loss": 0.03417317196726799, + "L0_d": 23801.28, + "MLM loss": 4.246782302856445, + "epoch": 0.12, + "step": 5499 + }, + { + "epoch": 0.12, + "learning_rate": 5.500000000000001e-05, + "loss": 4.4723, + "step": 5500 + }, + { + "FLOPS loss": 0.031081825494766235, + "L0_d": 19494.3, + "MLM loss": 4.069106101989746, + "epoch": 0.13, + "step": 5999 + }, + { + "epoch": 0.13, + "learning_rate": 6e-05, + "loss": 4.2738, + "step": 6000 + }, + { + "FLOPS loss": 0.028846798464655876, + "L0_d": 16482.78, + "MLM loss": 3.9512133598327637, + "epoch": 0.14, + "step": 6499 + }, + { + "epoch": 0.14, + "learning_rate": 6.500000000000001e-05, + "loss": 4.1219, + "step": 6500 + }, + { + "FLOPS loss": 0.02576448954641819, + "L0_d": 13962.62, + "MLM loss": 3.9717659950256348, + "epoch": 0.15, + "step": 6999 + }, + { + "epoch": 0.15, + "learning_rate": 7e-05, + "loss": 4.0019, + "step": 7000 + }, + { + "FLOPS loss": 0.03054131008684635, + "L0_d": 14436.58, + "MLM loss": 3.9484663009643555, + "epoch": 0.16, + "step": 7499 + }, + { + "epoch": 0.16, + "learning_rate": 7.500000000000001e-05, + "loss": 3.8882, + "step": 7500 + }, + { + "FLOPS loss": 0.027292482554912567, + "L0_d": 12377.77, + "MLM loss": 3.9403085708618164, + "epoch": 0.17, + "step": 7999 + }, + { + "epoch": 0.17, + "learning_rate": 8e-05, + "loss": 3.7931, + "step": 8000 + }, + { + "FLOPS loss": 0.02762710675597191, + "L0_d": 10414.95, + "MLM loss": 3.5601937770843506, + "epoch": 0.18, + "step": 8499 + }, + { + "epoch": 0.18, + "learning_rate": 8.5e-05, + "loss": 3.6941, + "step": 8500 + }, + { + "FLOPS loss": 0.02418929897248745, + "L0_d": 8721.27, + "MLM loss": 3.5979204177856445, + "epoch": 0.19, + "step": 8999 + }, + { + "epoch": 0.19, + "learning_rate": 9e-05, + "loss": 3.5589, + "step": 9000 + }, + { + "FLOPS loss": 0.02961421199142933, + "L0_d": 9450.05, + "MLM loss": 3.2906453609466553, + "epoch": 0.2, + "step": 9499 + }, + { + "epoch": 0.2, + "learning_rate": 9.5e-05, + "loss": 3.4316, + "step": 9500 + }, + { + "FLOPS loss": 0.02749692089855671, + "L0_d": 7999.8, + "MLM loss": 3.2580208778381348, + "epoch": 0.21, + "step": 9999 + }, + { + "epoch": 0.21, + "learning_rate": 9.999000000000001e-05, + "loss": 3.334, + "step": 10000 + }, + { + "FLOPS loss": 0.02673165127635002, + "L0_d": 7029.53, + "MLM loss": 3.250258684158325, + "epoch": 0.22, + "step": 10499 + }, + { + "epoch": 0.22, + "learning_rate": 9.989816326530613e-05, + "loss": 3.2474, + "step": 10500 + }, + { + "FLOPS loss": 0.027633745223283768, + "L0_d": 7096.09, + "MLM loss": 3.3386759757995605, + "epoch": 0.23, + "step": 10999 + }, + { + "epoch": 0.23, + "learning_rate": 9.97961224489796e-05, + "loss": 3.1775, + "step": 11000 + }, + { + "FLOPS loss": 0.026252513751387596, + "L0_d": 6960.83, + "MLM loss": 3.018740177154541, + "epoch": 0.24, + "step": 11499 + }, + { + "epoch": 0.24, + "learning_rate": 9.969408163265307e-05, + "loss": 3.119, + "step": 11500 + }, + { + "FLOPS loss": 0.018239092081785202, + "L0_d": 5148.5, + "MLM loss": 3.0682997703552246, + "epoch": 0.26, + "step": 11999 + }, + { + "epoch": 0.26, + "learning_rate": 9.959204081632653e-05, + "loss": 3.0649, + "step": 12000 + }, + { + "FLOPS loss": 0.023073602467775345, + "L0_d": 5836.48, + "MLM loss": 3.1717731952667236, + "epoch": 0.27, + "step": 12499 + }, + { + "epoch": 0.27, + "learning_rate": 9.949020408163265e-05, + "loss": 3.017, + "step": 12500 + }, + { + "FLOPS loss": 0.024667827412486076, + "L0_d": 5631.75, + "MLM loss": 2.7986183166503906, + "epoch": 0.28, + "step": 12999 + }, + { + "epoch": 0.28, + "learning_rate": 9.938816326530612e-05, + "loss": 2.9808, + "step": 13000 + }, + { + "FLOPS loss": 0.023629698902368546, + "L0_d": 5088.77, + "MLM loss": 2.7454609870910645, + "epoch": 0.29, + "step": 13499 + }, + { + "epoch": 0.29, + "learning_rate": 9.92861224489796e-05, + "loss": 2.9405, + "step": 13500 + }, + { + "FLOPS loss": 0.024097440764307976, + "L0_d": 5203.92, + "MLM loss": 2.75523042678833, + "epoch": 0.3, + "step": 13999 + }, + { + "epoch": 0.3, + "learning_rate": 9.918408163265308e-05, + "loss": 2.9051, + "step": 14000 + }, + { + "FLOPS loss": 0.03014693595468998, + "L0_d": 5187.09, + "MLM loss": 2.7530577182769775, + "epoch": 0.31, + "step": 14499 + }, + { + "epoch": 0.31, + "learning_rate": 9.908224489795918e-05, + "loss": 2.8747, + "step": 14500 + }, + { + "FLOPS loss": 0.024567320942878723, + "L0_d": 4308.94, + "MLM loss": 2.730717182159424, + "epoch": 0.32, + "step": 14999 + }, + { + "epoch": 0.32, + "learning_rate": 9.898020408163266e-05, + "loss": 2.8491, + "step": 15000 + }, + { + "FLOPS loss": 0.025582976639270782, + "L0_d": 5050.28, + "MLM loss": 3.0116822719573975, + "epoch": 0.33, + "step": 15499 + }, + { + "epoch": 0.33, + "learning_rate": 9.887816326530613e-05, + "loss": 2.824, + "step": 15500 + }, + { + "FLOPS loss": 0.02788759395480156, + "L0_d": 4436.11, + "MLM loss": 2.500883102416992, + "epoch": 0.34, + "step": 15999 + }, + { + "epoch": 0.34, + "learning_rate": 9.877612244897959e-05, + "loss": 2.7988, + "step": 16000 + }, + { + "FLOPS loss": 0.033916205167770386, + "L0_d": 4878.62, + "MLM loss": 2.69112229347229, + "epoch": 0.35, + "step": 16499 + }, + { + "epoch": 0.35, + "learning_rate": 9.867428571428572e-05, + "loss": 2.7721, + "step": 16500 + }, + { + "FLOPS loss": 0.026273198425769806, + "L0_d": 4141.28, + "MLM loss": 2.696350336074829, + "epoch": 0.36, + "step": 16999 + }, + { + "epoch": 0.36, + "learning_rate": 9.857224489795919e-05, + "loss": 2.7496, + "step": 17000 + }, + { + "FLOPS loss": 0.02965918742120266, + "L0_d": 3847.95, + "MLM loss": 2.7344136238098145, + "epoch": 0.37, + "step": 17499 + }, + { + "epoch": 0.37, + "learning_rate": 9.847020408163265e-05, + "loss": 2.7262, + "step": 17500 + }, + { + "FLOPS loss": 0.027983248233795166, + "L0_d": 3737.28, + "MLM loss": 2.6528053283691406, + "epoch": 0.38, + "step": 17999 + }, + { + "epoch": 0.38, + "learning_rate": 9.836816326530612e-05, + "loss": 2.7069, + "step": 18000 + }, + { + "FLOPS loss": 0.03850917890667915, + "L0_d": 4032.44, + "MLM loss": 2.837846279144287, + "epoch": 0.39, + "step": 18499 + }, + { + "epoch": 0.39, + "learning_rate": 9.826632653061225e-05, + "loss": 2.6941, + "step": 18500 + }, + { + "FLOPS loss": 0.03680583834648132, + "L0_d": 4449.95, + "MLM loss": 2.7556967735290527, + "epoch": 0.4, + "step": 18999 + }, + { + "epoch": 0.4, + "learning_rate": 9.816428571428572e-05, + "loss": 2.6743, + "step": 19000 + }, + { + "FLOPS loss": 0.027939151972532272, + "L0_d": 2543.42, + "MLM loss": 2.738865375518799, + "epoch": 0.42, + "step": 19499 + }, + { + "epoch": 0.42, + "learning_rate": 9.806224489795918e-05, + "loss": 2.6548, + "step": 19500 + }, + { + "FLOPS loss": 0.03628528118133545, + "L0_d": 3611.06, + "MLM loss": 2.7977004051208496, + "epoch": 0.43, + "step": 19999 + }, + { + "epoch": 0.43, + "learning_rate": 9.796020408163266e-05, + "loss": 2.6434, + "step": 20000 + }, + { + "FLOPS loss": 0.0318717323243618, + "L0_d": 3071.55, + "MLM loss": 2.655547618865967, + "epoch": 0.44, + "step": 20499 + }, + { + "epoch": 0.44, + "learning_rate": 9.785836734693878e-05, + "loss": 2.6301, + "step": 20500 + }, + { + "FLOPS loss": 0.039533913135528564, + "L0_d": 3564.52, + "MLM loss": 2.4931273460388184, + "epoch": 0.45, + "step": 20999 + }, + { + "epoch": 0.45, + "learning_rate": 9.77565306122449e-05, + "loss": 2.6197, + "step": 21000 + }, + { + "FLOPS loss": 0.03401036560535431, + "L0_d": 3066.97, + "MLM loss": 2.598534107208252, + "epoch": 0.46, + "step": 21499 + }, + { + "epoch": 0.46, + "learning_rate": 9.765448979591837e-05, + "loss": 2.6015, + "step": 21500 + }, + { + "FLOPS loss": 0.032060861587524414, + "L0_d": 2761.09, + "MLM loss": 2.5144989490509033, + "epoch": 0.47, + "step": 21999 + }, + { + "epoch": 0.47, + "learning_rate": 9.755244897959183e-05, + "loss": 2.5921, + "step": 22000 + }, + { + "FLOPS loss": 0.03773064538836479, + "L0_d": 2863.8, + "MLM loss": 2.4851295948028564, + "epoch": 0.48, + "step": 22499 + }, + { + "epoch": 0.48, + "learning_rate": 9.745040816326531e-05, + "loss": 2.5806, + "step": 22500 + }, + { + "FLOPS loss": 0.03615275397896767, + "L0_d": 2681.7, + "MLM loss": 2.422513484954834, + "epoch": 0.49, + "step": 22999 + }, + { + "epoch": 0.49, + "learning_rate": 9.734836734693879e-05, + "loss": 2.5702, + "step": 23000 + }, + { + "FLOPS loss": 0.04175760596990585, + "L0_d": 2752.3, + "MLM loss": 2.5466079711914062, + "epoch": 0.5, + "step": 23499 + }, + { + "epoch": 0.5, + "learning_rate": 9.724632653061225e-05, + "loss": 2.5621, + "step": 23500 + }, + { + "FLOPS loss": 0.04358026012778282, + "L0_d": 3140.41, + "MLM loss": 2.4925756454467773, + "epoch": 0.51, + "step": 23999 + }, + { + "epoch": 0.51, + "learning_rate": 9.714428571428572e-05, + "loss": 2.5497, + "step": 24000 + }, + { + "FLOPS loss": 0.03997480496764183, + "L0_d": 2636.91, + "MLM loss": 2.4403679370880127, + "epoch": 0.52, + "step": 24499 + }, + { + "epoch": 0.52, + "learning_rate": 9.70422448979592e-05, + "loss": 2.5387, + "step": 24500 + }, + { + "FLOPS loss": 0.04443227872252464, + "L0_d": 2863.91, + "MLM loss": 2.702801465988159, + "epoch": 0.53, + "step": 24999 + }, + { + "epoch": 0.53, + "learning_rate": 9.694061224489797e-05, + "loss": 2.5299, + "step": 25000 + }, + { + "FLOPS loss": 0.03630707040429115, + "L0_d": 1869.66, + "MLM loss": 2.500286102294922, + "epoch": 0.54, + "step": 25499 + }, + { + "epoch": 0.54, + "learning_rate": 9.683857142857144e-05, + "loss": 2.5238, + "step": 25500 + }, + { + "FLOPS loss": 0.045446157455444336, + "L0_d": 2295.62, + "MLM loss": 2.675072193145752, + "epoch": 0.55, + "step": 25999 + }, + { + "epoch": 0.55, + "learning_rate": 9.67365306122449e-05, + "loss": 2.5185, + "step": 26000 + }, + { + "FLOPS loss": 0.03888548165559769, + "L0_d": 2242.59, + "MLM loss": 2.5834720134735107, + "epoch": 0.56, + "step": 26499 + }, + { + "epoch": 0.56, + "learning_rate": 9.663448979591837e-05, + "loss": 2.5038, + "step": 26500 + }, + { + "FLOPS loss": 0.04285123944282532, + "L0_d": 2071.3, + "MLM loss": 2.2703781127929688, + "epoch": 0.57, + "step": 26999 + }, + { + "epoch": 0.57, + "learning_rate": 9.653244897959184e-05, + "loss": 2.5001, + "step": 27000 + }, + { + "FLOPS loss": 0.04465385898947716, + "L0_d": 2078.45, + "MLM loss": 2.442018985748291, + "epoch": 0.59, + "step": 27499 + }, + { + "epoch": 0.59, + "learning_rate": 9.643040816326531e-05, + "loss": 2.4859, + "step": 27500 + }, + { + "FLOPS loss": 0.05151209235191345, + "L0_d": 2542.83, + "MLM loss": 2.4004836082458496, + "epoch": 0.6, + "step": 27999 + }, + { + "epoch": 0.6, + "learning_rate": 9.632836734693877e-05, + "loss": 2.4844, + "step": 28000 + }, + { + "FLOPS loss": 0.04146797955036163, + "L0_d": 2260.97, + "MLM loss": 2.3301563262939453, + "epoch": 0.61, + "step": 28499 + }, + { + "epoch": 0.61, + "learning_rate": 9.622653061224491e-05, + "loss": 2.4795, + "step": 28500 + }, + { + "FLOPS loss": 0.040446631610393524, + "L0_d": 1544.88, + "MLM loss": 2.348306179046631, + "epoch": 0.62, + "step": 28999 + }, + { + "epoch": 0.62, + "learning_rate": 9.612448979591837e-05, + "loss": 2.4705, + "step": 29000 + }, + { + "FLOPS loss": 0.048661969602108, + "L0_d": 1644.22, + "MLM loss": 2.352621555328369, + "epoch": 0.63, + "step": 29499 + }, + { + "epoch": 0.63, + "learning_rate": 9.602244897959184e-05, + "loss": 2.4636, + "step": 29500 + }, + { + "FLOPS loss": 0.04659537598490715, + "L0_d": 2116.88, + "MLM loss": 2.5809438228607178, + "epoch": 0.64, + "step": 29999 + }, + { + "epoch": 0.64, + "learning_rate": 9.59204081632653e-05, + "loss": 2.4569, + "step": 30000 + }, + { + "FLOPS loss": 0.04385443776845932, + "L0_d": 1462.5, + "MLM loss": 2.521130323410034, + "epoch": 0.65, + "step": 30499 + }, + { + "epoch": 0.65, + "learning_rate": 9.581836734693878e-05, + "loss": 2.4556, + "step": 30500 + }, + { + "FLOPS loss": 0.05597566068172455, + "L0_d": 1834.39, + "MLM loss": 2.3147106170654297, + "epoch": 0.66, + "step": 30999 + }, + { + "epoch": 0.66, + "learning_rate": 9.57165306122449e-05, + "loss": 2.4486, + "step": 31000 + }, + { + "FLOPS loss": 0.05618233606219292, + "L0_d": 1604.23, + "MLM loss": 2.6163783073425293, + "epoch": 0.67, + "step": 31499 + }, + { + "epoch": 0.67, + "learning_rate": 9.561448979591837e-05, + "loss": 2.4394, + "step": 31500 + }, + { + "FLOPS loss": 0.056978415697813034, + "L0_d": 1700.08, + "MLM loss": 2.268538236618042, + "epoch": 0.68, + "step": 31999 + }, + { + "epoch": 0.68, + "learning_rate": 9.551244897959184e-05, + "loss": 2.4394, + "step": 32000 + }, + { + "FLOPS loss": 0.05571691691875458, + "L0_d": 1859.88, + "MLM loss": 2.3873777389526367, + "epoch": 0.69, + "step": 32499 + }, + { + "epoch": 0.69, + "learning_rate": 9.541040816326531e-05, + "loss": 2.4321, + "step": 32500 + }, + { + "FLOPS loss": 0.058303989470005035, + "L0_d": 1647.94, + "MLM loss": 2.3315484523773193, + "epoch": 0.7, + "step": 32999 + }, + { + "epoch": 0.7, + "learning_rate": 9.530857142857144e-05, + "loss": 2.4282, + "step": 33000 + }, + { + "FLOPS loss": 0.05903186276555061, + "L0_d": 1568.08, + "MLM loss": 2.4503281116485596, + "epoch": 0.71, + "step": 33499 + }, + { + "epoch": 0.71, + "learning_rate": 9.520653061224491e-05, + "loss": 2.4228, + "step": 33500 + }, + { + "FLOPS loss": 0.05316786840558052, + "L0_d": 1538.89, + "MLM loss": 2.356748104095459, + "epoch": 0.72, + "step": 33999 + }, + { + "epoch": 0.72, + "learning_rate": 9.510448979591837e-05, + "loss": 2.4234, + "step": 34000 + }, + { + "FLOPS loss": 0.053176648914813995, + "L0_d": 1371.69, + "MLM loss": 2.461960554122925, + "epoch": 0.73, + "step": 34499 + }, + { + "epoch": 0.73, + "learning_rate": 9.500244897959184e-05, + "loss": 2.4175, + "step": 34500 + }, + { + "FLOPS loss": 0.05513223633170128, + "L0_d": 1471.97, + "MLM loss": 2.52297306060791, + "epoch": 0.75, + "step": 34999 + }, + { + "epoch": 0.75, + "learning_rate": 9.490040816326531e-05, + "loss": 2.4137, + "step": 35000 + }, + { + "FLOPS loss": 0.06698770076036453, + "L0_d": 1671.09, + "MLM loss": 2.314728021621704, + "epoch": 0.76, + "step": 35499 + }, + { + "epoch": 0.76, + "learning_rate": 9.479857142857144e-05, + "loss": 2.4086, + "step": 35500 + }, + { + "FLOPS loss": 0.054937928915023804, + "L0_d": 1281.67, + "MLM loss": 2.2821147441864014, + "epoch": 0.77, + "step": 35999 + }, + { + "epoch": 0.77, + "learning_rate": 9.46965306122449e-05, + "loss": 2.4029, + "step": 36000 + }, + { + "FLOPS loss": 0.06599493324756622, + "L0_d": 1534.23, + "MLM loss": 2.372270345687866, + "epoch": 0.78, + "step": 36499 + }, + { + "epoch": 0.78, + "learning_rate": 9.459448979591838e-05, + "loss": 2.3998, + "step": 36500 + }, + { + "FLOPS loss": 0.0572284571826458, + "L0_d": 1141.72, + "MLM loss": 2.2006995677948, + "epoch": 0.79, + "step": 36999 + }, + { + "epoch": 0.79, + "learning_rate": 9.449244897959184e-05, + "loss": 2.3925, + "step": 37000 + }, + { + "FLOPS loss": 0.0742960199713707, + "L0_d": 1721.53, + "MLM loss": 2.2286112308502197, + "epoch": 0.8, + "step": 37499 + }, + { + "epoch": 0.8, + "learning_rate": 9.439061224489796e-05, + "loss": 2.3937, + "step": 37500 + }, + { + "FLOPS loss": 0.057512883096933365, + "L0_d": 916.3, + "MLM loss": 2.375272750854492, + "epoch": 0.81, + "step": 37999 + }, + { + "epoch": 0.81, + "learning_rate": 9.428857142857144e-05, + "loss": 2.3879, + "step": 38000 + }, + { + "FLOPS loss": 0.07384462654590607, + "L0_d": 1570.23, + "MLM loss": 2.320944309234619, + "epoch": 0.82, + "step": 38499 + }, + { + "epoch": 0.82, + "learning_rate": 9.41865306122449e-05, + "loss": 2.386, + "step": 38500 + }, + { + "FLOPS loss": 0.05838833004236221, + "L0_d": 976.95, + "MLM loss": 2.330425262451172, + "epoch": 0.83, + "step": 38999 + }, + { + "epoch": 0.83, + "learning_rate": 9.408448979591837e-05, + "loss": 2.3854, + "step": 39000 + }, + { + "FLOPS loss": 0.06257197260856628, + "L0_d": 1007.91, + "MLM loss": 2.2275640964508057, + "epoch": 0.84, + "step": 39499 + }, + { + "epoch": 0.84, + "learning_rate": 9.398265306122449e-05, + "loss": 2.3766, + "step": 39500 + }, + { + "FLOPS loss": 0.06444195657968521, + "L0_d": 1197.8, + "MLM loss": 2.4724812507629395, + "epoch": 0.85, + "step": 39999 + }, + { + "epoch": 0.85, + "learning_rate": 9.388061224489796e-05, + "loss": 2.3743, + "step": 40000 + }, + { + "FLOPS loss": 0.07541827857494354, + "L0_d": 1409.95, + "MLM loss": 2.302579402923584, + "epoch": 0.86, + "step": 40499 + }, + { + "epoch": 0.86, + "learning_rate": 9.377857142857144e-05, + "loss": 2.3769, + "step": 40500 + }, + { + "FLOPS loss": 0.08086178451776505, + "L0_d": 1100.89, + "MLM loss": 2.3238229751586914, + "epoch": 0.87, + "step": 40999 + }, + { + "epoch": 0.87, + "learning_rate": 9.36765306122449e-05, + "loss": 2.3687, + "step": 41000 + }, + { + "FLOPS loss": 0.07856693863868713, + "L0_d": 1270.05, + "MLM loss": 2.2399253845214844, + "epoch": 0.88, + "step": 41499 + }, + { + "epoch": 0.88, + "learning_rate": 9.357448979591838e-05, + "loss": 2.3666, + "step": 41500 + }, + { + "FLOPS loss": 0.07329129427671432, + "L0_d": 1334.06, + "MLM loss": 2.3387813568115234, + "epoch": 0.89, + "step": 41999 + }, + { + "epoch": 0.89, + "learning_rate": 9.347265306122449e-05, + "loss": 2.3618, + "step": 42000 + }, + { + "FLOPS loss": 0.07289435714483261, + "L0_d": 1140.52, + "MLM loss": 2.252929449081421, + "epoch": 0.91, + "step": 42499 + }, + { + "epoch": 0.91, + "learning_rate": 9.337061224489796e-05, + "loss": 2.3592, + "step": 42500 + }, + { + "FLOPS loss": 0.07385888695716858, + "L0_d": 1153.45, + "MLM loss": 2.245168924331665, + "epoch": 0.92, + "step": 42999 + }, + { + "epoch": 0.92, + "learning_rate": 9.326857142857144e-05, + "loss": 2.3607, + "step": 43000 + }, + { + "FLOPS loss": 0.07996492087841034, + "L0_d": 1155.66, + "MLM loss": 2.101062536239624, + "epoch": 0.93, + "step": 43499 + }, + { + "epoch": 0.93, + "learning_rate": 9.31665306122449e-05, + "loss": 2.3536, + "step": 43500 + }, + { + "FLOPS loss": 0.07473205775022507, + "L0_d": 1201.67, + "MLM loss": 2.2482810020446777, + "epoch": 0.94, + "step": 43999 + }, + { + "epoch": 0.94, + "learning_rate": 9.306469387755103e-05, + "loss": 2.3592, + "step": 44000 + }, + { + "FLOPS loss": 0.0637981966137886, + "L0_d": 694.08, + "MLM loss": 2.2862000465393066, + "epoch": 0.95, + "step": 44499 + }, + { + "epoch": 0.95, + "learning_rate": 9.296265306122449e-05, + "loss": 2.3556, + "step": 44500 + }, + { + "FLOPS loss": 0.07323095202445984, + "L0_d": 1020.36, + "MLM loss": 2.3087589740753174, + "epoch": 0.96, + "step": 44999 + }, + { + "epoch": 0.96, + "learning_rate": 9.286061224489796e-05, + "loss": 2.3506, + "step": 45000 + }, + { + "FLOPS loss": 0.06839210540056229, + "L0_d": 737.48, + "MLM loss": 2.2693538665771484, + "epoch": 0.97, + "step": 45499 + }, + { + "epoch": 0.97, + "learning_rate": 9.275857142857143e-05, + "loss": 2.3474, + "step": 45500 + }, + { + "FLOPS loss": 0.058115143328905106, + "L0_d": 811.34, + "MLM loss": 2.4099068641662598, + "epoch": 0.98, + "step": 45999 + }, + { + "epoch": 0.98, + "learning_rate": 9.265653061224491e-05, + "loss": 2.3457, + "step": 46000 + }, + { + "FLOPS loss": 0.07992670685052872, + "L0_d": 1060.0, + "MLM loss": 2.431608200073242, + "epoch": 0.99, + "step": 46499 + }, + { + "epoch": 0.99, + "learning_rate": 9.255469387755102e-05, + "loss": 2.3416, + "step": 46500 + }, + { + "FLOPS loss": 0.07746997475624084, + "L0_d": 863.0, + "MLM loss": 2.2116293907165527, + "epoch": 1.0, + "step": 46999 + }, + { + "epoch": 1.0, + "learning_rate": 9.24526530612245e-05, + "loss": 2.3453, + "step": 47000 + }, + { + "FLOPS loss": 0.06712145358324051, + "L0_d": 780.94, + "MLM loss": 2.144688844680786, + "epoch": 1.01, + "step": 47499 + }, + { + "epoch": 1.01, + "learning_rate": 9.235061224489796e-05, + "loss": 2.3361, + "step": 47500 + }, + { + "FLOPS loss": 0.07461294531822205, + "L0_d": 843.0, + "MLM loss": 2.273952007293701, + "epoch": 1.02, + "step": 47999 + }, + { + "epoch": 1.02, + "learning_rate": 9.224857142857143e-05, + "loss": 2.339, + "step": 48000 + }, + { + "FLOPS loss": 0.06439612060785294, + "L0_d": 755.41, + "MLM loss": 2.1934714317321777, + "epoch": 1.03, + "step": 48499 + }, + { + "epoch": 1.03, + "learning_rate": 9.21465306122449e-05, + "loss": 2.3308, + "step": 48500 + }, + { + "FLOPS loss": 0.08083415776491165, + "L0_d": 686.83, + "MLM loss": 2.324183940887451, + "epoch": 1.04, + "step": 48999 + }, + { + "epoch": 1.04, + "learning_rate": 9.204469387755103e-05, + "loss": 2.3345, + "step": 49000 + }, + { + "FLOPS loss": 0.0857621356844902, + "L0_d": 888.97, + "MLM loss": 2.316976547241211, + "epoch": 1.05, + "step": 49499 + }, + { + "epoch": 1.05, + "learning_rate": 9.194265306122449e-05, + "loss": 2.3313, + "step": 49500 + }, + { + "FLOPS loss": 0.10324012488126755, + "L0_d": 1068.78, + "MLM loss": 2.3727731704711914, + "epoch": 1.06, + "step": 49999 + }, + { + "epoch": 1.06, + "learning_rate": 9.184061224489796e-05, + "loss": 2.3325, + "step": 50000 + }, + { + "FLOPS loss": 0.08102630078792572, + "L0_d": 728.97, + "MLM loss": 2.3263208866119385, + "epoch": 1.08, + "step": 50499 + }, + { + "epoch": 1.08, + "learning_rate": 9.173857142857143e-05, + "loss": 2.328, + "step": 50500 + }, + { + "FLOPS loss": 0.07676102221012115, + "L0_d": 721.47, + "MLM loss": 2.250537395477295, + "epoch": 1.09, + "step": 50999 + }, + { + "epoch": 1.09, + "learning_rate": 9.16365306122449e-05, + "loss": 2.3182, + "step": 51000 + }, + { + "FLOPS loss": 0.08790870010852814, + "L0_d": 903.42, + "MLM loss": 2.2065377235412598, + "epoch": 1.1, + "step": 51499 + }, + { + "epoch": 1.1, + "learning_rate": 9.153469387755102e-05, + "loss": 2.3219, + "step": 51500 + }, + { + "FLOPS loss": 0.06780925393104553, + "L0_d": 552.94, + "MLM loss": 2.3474671840667725, + "epoch": 1.11, + "step": 51999 + }, + { + "epoch": 1.11, + "learning_rate": 9.14326530612245e-05, + "loss": 2.3219, + "step": 52000 + }, + { + "FLOPS loss": 0.08725595474243164, + "L0_d": 719.84, + "MLM loss": 2.1877591609954834, + "epoch": 1.12, + "step": 52499 + }, + { + "epoch": 1.12, + "learning_rate": 9.133061224489796e-05, + "loss": 2.3138, + "step": 52500 + }, + { + "FLOPS loss": 0.0755017101764679, + "L0_d": 583.75, + "MLM loss": 2.151200771331787, + "epoch": 1.13, + "step": 52999 + }, + { + "epoch": 1.13, + "learning_rate": 9.122857142857143e-05, + "loss": 2.3121, + "step": 53000 + }, + { + "FLOPS loss": 0.08711685240268707, + "L0_d": 698.66, + "MLM loss": 2.409646987915039, + "epoch": 1.14, + "step": 53499 + }, + { + "epoch": 1.14, + "learning_rate": 9.112673469387756e-05, + "loss": 2.3146, + "step": 53500 + }, + { + "FLOPS loss": 0.08219663053750992, + "L0_d": 696.67, + "MLM loss": 2.1099956035614014, + "epoch": 1.15, + "step": 53999 + }, + { + "epoch": 1.15, + "learning_rate": 9.102469387755103e-05, + "loss": 2.3065, + "step": 54000 + }, + { + "FLOPS loss": 0.0765044316649437, + "L0_d": 606.97, + "MLM loss": 2.2134242057800293, + "epoch": 1.16, + "step": 54499 + }, + { + "epoch": 1.16, + "learning_rate": 9.092265306122449e-05, + "loss": 2.3057, + "step": 54500 + }, + { + "FLOPS loss": 0.09635642915964127, + "L0_d": 905.95, + "MLM loss": 2.1616404056549072, + "epoch": 1.17, + "step": 54999 + }, + { + "epoch": 1.17, + "learning_rate": 9.082061224489796e-05, + "loss": 2.3022, + "step": 55000 + }, + { + "FLOPS loss": 0.09710858017206192, + "L0_d": 1261.88, + "MLM loss": 2.2462549209594727, + "epoch": 1.18, + "step": 55499 + }, + { + "epoch": 1.18, + "learning_rate": 9.071877551020409e-05, + "loss": 2.2989, + "step": 55500 + }, + { + "FLOPS loss": 0.0736132338643074, + "L0_d": 562.47, + "MLM loss": 2.2699902057647705, + "epoch": 1.19, + "step": 55999 + }, + { + "epoch": 1.19, + "learning_rate": 9.061673469387755e-05, + "loss": 2.3008, + "step": 56000 + }, + { + "FLOPS loss": 0.07662803679704666, + "L0_d": 789.95, + "MLM loss": 2.221466064453125, + "epoch": 1.2, + "step": 56499 + }, + { + "epoch": 1.2, + "learning_rate": 9.051469387755103e-05, + "loss": 2.2922, + "step": 56500 + }, + { + "FLOPS loss": 0.07387778162956238, + "L0_d": 666.42, + "MLM loss": 1.9749205112457275, + "epoch": 1.21, + "step": 56999 + }, + { + "epoch": 1.21, + "learning_rate": 9.04126530612245e-05, + "loss": 2.2898, + "step": 57000 + }, + { + "FLOPS loss": 0.06573477387428284, + "L0_d": 532.02, + "MLM loss": 2.511476755142212, + "epoch": 1.22, + "step": 57499 + }, + { + "epoch": 1.22, + "learning_rate": 9.031061224489796e-05, + "loss": 2.2877, + "step": 57500 + }, + { + "FLOPS loss": 0.07265348732471466, + "L0_d": 901.97, + "MLM loss": 2.1760964393615723, + "epoch": 1.24, + "step": 57999 + }, + { + "epoch": 1.24, + "learning_rate": 9.02087755102041e-05, + "loss": 2.2906, + "step": 58000 + }, + { + "FLOPS loss": 0.08701298385858536, + "L0_d": 941.17, + "MLM loss": 2.3221704959869385, + "epoch": 1.25, + "step": 58499 + }, + { + "epoch": 1.25, + "learning_rate": 9.010673469387756e-05, + "loss": 2.2837, + "step": 58500 + }, + { + "FLOPS loss": 0.07435303926467896, + "L0_d": 884.02, + "MLM loss": 2.2115917205810547, + "epoch": 1.26, + "step": 58999 + }, + { + "epoch": 1.26, + "learning_rate": 9.000469387755103e-05, + "loss": 2.286, + "step": 59000 + }, + { + "FLOPS loss": 0.08176206052303314, + "L0_d": 952.75, + "MLM loss": 2.4079854488372803, + "epoch": 1.27, + "step": 59499 + }, + { + "epoch": 1.27, + "learning_rate": 8.990265306122449e-05, + "loss": 2.2774, + "step": 59500 + }, + { + "FLOPS loss": 0.08938340097665787, + "L0_d": 851.17, + "MLM loss": 2.175987958908081, + "epoch": 1.28, + "step": 59999 + }, + { + "epoch": 1.28, + "learning_rate": 8.980081632653061e-05, + "loss": 2.2775, + "step": 60000 + }, + { + "FLOPS loss": 0.09279684722423553, + "L0_d": 1012.95, + "MLM loss": 2.1451687812805176, + "epoch": 1.29, + "step": 60499 + }, + { + "epoch": 1.29, + "learning_rate": 8.969877551020408e-05, + "loss": 2.2741, + "step": 60500 + }, + { + "FLOPS loss": 0.08175905793905258, + "L0_d": 546.34, + "MLM loss": 2.071819543838501, + "epoch": 1.3, + "step": 60999 + }, + { + "epoch": 1.3, + "learning_rate": 8.959673469387755e-05, + "loss": 2.2753, + "step": 61000 + }, + { + "FLOPS loss": 0.07440430670976639, + "L0_d": 629.91, + "MLM loss": 2.1951260566711426, + "epoch": 1.31, + "step": 61499 + }, + { + "epoch": 1.31, + "learning_rate": 8.949469387755103e-05, + "loss": 2.2723, + "step": 61500 + }, + { + "FLOPS loss": 0.07724060118198395, + "L0_d": 933.22, + "MLM loss": 2.193014621734619, + "epoch": 1.32, + "step": 61999 + }, + { + "epoch": 1.32, + "learning_rate": 8.939285714285714e-05, + "loss": 2.2686, + "step": 62000 + }, + { + "FLOPS loss": 0.07582145184278488, + "L0_d": 789.0, + "MLM loss": 2.2125964164733887, + "epoch": 1.33, + "step": 62499 + }, + { + "epoch": 1.33, + "learning_rate": 8.929081632653062e-05, + "loss": 2.2678, + "step": 62500 + }, + { + "FLOPS loss": 0.08202211558818817, + "L0_d": 931.83, + "MLM loss": 2.278818368911743, + "epoch": 1.34, + "step": 62999 + }, + { + "epoch": 1.34, + "learning_rate": 8.918877551020408e-05, + "loss": 2.2621, + "step": 63000 + }, + { + "FLOPS loss": 0.08058538287878036, + "L0_d": 656.52, + "MLM loss": 2.1786534786224365, + "epoch": 1.35, + "step": 63499 + }, + { + "epoch": 1.35, + "learning_rate": 8.908673469387756e-05, + "loss": 2.2596, + "step": 63500 + }, + { + "FLOPS loss": 0.09940757602453232, + "L0_d": 681.25, + "MLM loss": 2.147231101989746, + "epoch": 1.36, + "step": 63999 + }, + { + "epoch": 1.36, + "learning_rate": 8.898489795918368e-05, + "loss": 2.2634, + "step": 64000 + }, + { + "FLOPS loss": 0.08605896681547165, + "L0_d": 1384.86, + "MLM loss": 2.206131935119629, + "epoch": 1.37, + "step": 64499 + }, + { + "epoch": 1.37, + "learning_rate": 8.888285714285715e-05, + "loss": 2.2593, + "step": 64500 + }, + { + "FLOPS loss": 0.06299304962158203, + "L0_d": 629.77, + "MLM loss": 2.2180962562561035, + "epoch": 1.38, + "step": 64999 + }, + { + "epoch": 1.38, + "learning_rate": 8.878081632653061e-05, + "loss": 2.2553, + "step": 65000 + }, + { + "FLOPS loss": 0.07634347677230835, + "L0_d": 772.73, + "MLM loss": 2.0626327991485596, + "epoch": 1.39, + "step": 65499 + }, + { + "epoch": 1.39, + "learning_rate": 8.867877551020408e-05, + "loss": 2.2522, + "step": 65500 + }, + { + "FLOPS loss": 0.08463647216558456, + "L0_d": 799.12, + "MLM loss": 2.189845085144043, + "epoch": 1.41, + "step": 65999 + }, + { + "epoch": 1.41, + "learning_rate": 8.857673469387755e-05, + "loss": 2.2519, + "step": 66000 + }, + { + "FLOPS loss": 0.0599059984087944, + "L0_d": 554.03, + "MLM loss": 2.150831937789917, + "epoch": 1.42, + "step": 66499 + }, + { + "epoch": 1.42, + "learning_rate": 8.847489795918367e-05, + "loss": 2.2505, + "step": 66500 + }, + { + "FLOPS loss": 0.06849193572998047, + "L0_d": 659.19, + "MLM loss": 2.07958722114563, + "epoch": 1.43, + "step": 66999 + }, + { + "epoch": 1.43, + "learning_rate": 8.837285714285715e-05, + "loss": 2.2467, + "step": 67000 + }, + { + "FLOPS loss": 0.07847350835800171, + "L0_d": 1053.48, + "MLM loss": 2.0575110912323, + "epoch": 1.44, + "step": 67499 + }, + { + "epoch": 1.44, + "learning_rate": 8.827081632653062e-05, + "loss": 2.2458, + "step": 67500 + }, + { + "FLOPS loss": 0.07554128021001816, + "L0_d": 573.47, + "MLM loss": 2.2414090633392334, + "epoch": 1.45, + "step": 67999 + }, + { + "epoch": 1.45, + "learning_rate": 8.816877551020408e-05, + "loss": 2.2414, + "step": 68000 + }, + { + "FLOPS loss": 0.07916034013032913, + "L0_d": 806.33, + "MLM loss": 2.220374584197998, + "epoch": 1.46, + "step": 68499 + }, + { + "epoch": 1.46, + "learning_rate": 8.806693877551022e-05, + "loss": 2.2412, + "step": 68500 + }, + { + "FLOPS loss": 0.06981582939624786, + "L0_d": 521.97, + "MLM loss": 2.2153103351593018, + "epoch": 1.47, + "step": 68999 + }, + { + "epoch": 1.47, + "learning_rate": 8.796489795918368e-05, + "loss": 2.2436, + "step": 69000 + }, + { + "FLOPS loss": 0.10114899277687073, + "L0_d": 862.02, + "MLM loss": 2.131760597229004, + "epoch": 1.48, + "step": 69499 + }, + { + "epoch": 1.48, + "learning_rate": 8.786285714285715e-05, + "loss": 2.237, + "step": 69500 + }, + { + "FLOPS loss": 0.0677991583943367, + "L0_d": 871.22, + "MLM loss": 2.202712297439575, + "epoch": 1.49, + "step": 69999 + }, + { + "epoch": 1.49, + "learning_rate": 8.776081632653061e-05, + "loss": 2.2359, + "step": 70000 + }, + { + "FLOPS loss": 0.06798284500837326, + "L0_d": 805.94, + "MLM loss": 2.167118787765503, + "epoch": 1.5, + "step": 70499 + }, + { + "epoch": 1.5, + "learning_rate": 8.765877551020409e-05, + "loss": 2.2326, + "step": 70500 + }, + { + "FLOPS loss": 0.07358480244874954, + "L0_d": 592.3, + "MLM loss": 2.184145927429199, + "epoch": 1.51, + "step": 70999 + }, + { + "epoch": 1.51, + "learning_rate": 8.755693877551021e-05, + "loss": 2.2309, + "step": 71000 + }, + { + "FLOPS loss": 0.05839638411998749, + "L0_d": 504.52, + "MLM loss": 2.011075019836426, + "epoch": 1.52, + "step": 71499 + }, + { + "epoch": 1.52, + "learning_rate": 8.745489795918367e-05, + "loss": 2.2312, + "step": 71500 + }, + { + "FLOPS loss": 0.07558299601078033, + "L0_d": 876.62, + "MLM loss": 2.282175302505493, + "epoch": 1.53, + "step": 71999 + }, + { + "epoch": 1.53, + "learning_rate": 8.735285714285715e-05, + "loss": 2.2261, + "step": 72000 + }, + { + "FLOPS loss": 0.07914137095212936, + "L0_d": 748.42, + "MLM loss": 2.159878969192505, + "epoch": 1.54, + "step": 72499 + }, + { + "epoch": 1.54, + "learning_rate": 8.725081632653062e-05, + "loss": 2.2303, + "step": 72500 + }, + { + "FLOPS loss": 0.07840941846370697, + "L0_d": 837.8, + "MLM loss": 2.1828160285949707, + "epoch": 1.55, + "step": 72999 + }, + { + "epoch": 1.55, + "learning_rate": 8.714897959183674e-05, + "loss": 2.2263, + "step": 73000 + }, + { + "FLOPS loss": 0.08968845009803772, + "L0_d": 834.58, + "MLM loss": 2.078836679458618, + "epoch": 1.57, + "step": 73499 + }, + { + "epoch": 1.57, + "learning_rate": 8.704693877551022e-05, + "loss": 2.2241, + "step": 73500 + }, + { + "FLOPS loss": 0.0636233538389206, + "L0_d": 760.53, + "MLM loss": 2.119175672531128, + "epoch": 1.58, + "step": 73999 + }, + { + "epoch": 1.58, + "learning_rate": 8.694489795918368e-05, + "loss": 2.2213, + "step": 74000 + }, + { + "FLOPS loss": 0.08825450390577316, + "L0_d": 1082.2, + "MLM loss": 2.0904204845428467, + "epoch": 1.59, + "step": 74499 + }, + { + "epoch": 1.59, + "learning_rate": 8.684285714285715e-05, + "loss": 2.2189, + "step": 74500 + }, + { + "FLOPS loss": 0.07530651986598969, + "L0_d": 793.81, + "MLM loss": 2.1687419414520264, + "epoch": 1.6, + "step": 74999 + }, + { + "epoch": 1.6, + "learning_rate": 8.674102040816327e-05, + "loss": 2.217, + "step": 75000 + }, + { + "FLOPS loss": 0.09109428524971008, + "L0_d": 926.31, + "MLM loss": 2.2190775871276855, + "epoch": 1.61, + "step": 75499 + }, + { + "epoch": 1.61, + "learning_rate": 8.663897959183674e-05, + "loss": 2.2167, + "step": 75500 + }, + { + "FLOPS loss": 0.08965260535478592, + "L0_d": 815.39, + "MLM loss": 2.24111270904541, + "epoch": 1.62, + "step": 75999 + }, + { + "epoch": 1.62, + "learning_rate": 8.653693877551021e-05, + "loss": 2.2158, + "step": 76000 + }, + { + "FLOPS loss": 0.08030744642019272, + "L0_d": 710.55, + "MLM loss": 2.096034049987793, + "epoch": 1.63, + "step": 76499 + }, + { + "epoch": 1.63, + "learning_rate": 8.643489795918369e-05, + "loss": 2.2151, + "step": 76500 + }, + { + "FLOPS loss": 0.08471863716840744, + "L0_d": 1109.98, + "MLM loss": 2.1691243648529053, + "epoch": 1.64, + "step": 76999 + }, + { + "epoch": 1.64, + "learning_rate": 8.633285714285715e-05, + "loss": 2.2155, + "step": 77000 + }, + { + "FLOPS loss": 0.0984283909201622, + "L0_d": 1047.89, + "MLM loss": 2.161064624786377, + "epoch": 1.65, + "step": 77499 + }, + { + "epoch": 1.65, + "learning_rate": 8.623102040816326e-05, + "loss": 2.209, + "step": 77500 + }, + { + "FLOPS loss": 0.08178388327360153, + "L0_d": 852.5, + "MLM loss": 2.168562173843384, + "epoch": 1.66, + "step": 77999 + }, + { + "epoch": 1.66, + "learning_rate": 8.612897959183674e-05, + "loss": 2.2078, + "step": 78000 + }, + { + "FLOPS loss": 0.09100330621004105, + "L0_d": 744.31, + "MLM loss": 2.0004794597625732, + "epoch": 1.67, + "step": 78499 + }, + { + "epoch": 1.67, + "learning_rate": 8.602693877551022e-05, + "loss": 2.2079, + "step": 78500 + }, + { + "FLOPS loss": 0.07992446422576904, + "L0_d": 707.66, + "MLM loss": 2.0477890968322754, + "epoch": 1.68, + "step": 78999 + }, + { + "epoch": 1.68, + "learning_rate": 8.592489795918368e-05, + "loss": 2.2054, + "step": 79000 + }, + { + "FLOPS loss": 0.07943347096443176, + "L0_d": 889.62, + "MLM loss": 2.190885305404663, + "epoch": 1.69, + "step": 79499 + }, + { + "epoch": 1.69, + "learning_rate": 8.58230612244898e-05, + "loss": 2.1998, + "step": 79500 + }, + { + "FLOPS loss": 0.06888856738805771, + "L0_d": 767.7, + "MLM loss": 2.1345748901367188, + "epoch": 1.7, + "step": 79999 + }, + { + "epoch": 1.7, + "learning_rate": 8.572102040816327e-05, + "loss": 2.2022, + "step": 80000 + }, + { + "FLOPS loss": 0.08001743257045746, + "L0_d": 809.2, + "MLM loss": 1.9894421100616455, + "epoch": 1.71, + "step": 80499 + }, + { + "epoch": 1.71, + "learning_rate": 8.561897959183673e-05, + "loss": 2.204, + "step": 80500 + }, + { + "FLOPS loss": 0.06890819221735, + "L0_d": 812.89, + "MLM loss": 2.2208876609802246, + "epoch": 1.72, + "step": 80999 + }, + { + "epoch": 1.72, + "learning_rate": 8.551693877551021e-05, + "loss": 2.1997, + "step": 81000 + }, + { + "FLOPS loss": 0.07821900397539139, + "L0_d": 687.19, + "MLM loss": 2.076125144958496, + "epoch": 1.74, + "step": 81499 + }, + { + "epoch": 1.74, + "learning_rate": 8.541510204081633e-05, + "loss": 2.197, + "step": 81500 + }, + { + "FLOPS loss": 0.09430748969316483, + "L0_d": 856.03, + "MLM loss": 2.220088481903076, + "epoch": 1.75, + "step": 81999 + }, + { + "epoch": 1.75, + "learning_rate": 8.53130612244898e-05, + "loss": 2.1946, + "step": 82000 + }, + { + "FLOPS loss": 0.0756566971540451, + "L0_d": 759.97, + "MLM loss": 1.9912607669830322, + "epoch": 1.76, + "step": 82499 + }, + { + "epoch": 1.76, + "learning_rate": 8.521102040816327e-05, + "loss": 2.1977, + "step": 82500 + }, + { + "FLOPS loss": 0.07251645624637604, + "L0_d": 630.61, + "MLM loss": 2.082219362258911, + "epoch": 1.77, + "step": 82999 + }, + { + "epoch": 1.77, + "learning_rate": 8.510897959183674e-05, + "loss": 2.1932, + "step": 83000 + }, + { + "FLOPS loss": 0.07821709662675858, + "L0_d": 845.48, + "MLM loss": 2.2003273963928223, + "epoch": 1.78, + "step": 83499 + }, + { + "epoch": 1.78, + "learning_rate": 8.50069387755102e-05, + "loss": 2.1929, + "step": 83500 + }, + { + "FLOPS loss": 0.08727140724658966, + "L0_d": 880.19, + "MLM loss": 2.121065139770508, + "epoch": 1.79, + "step": 83999 + }, + { + "epoch": 1.79, + "learning_rate": 8.490510204081634e-05, + "loss": 2.1897, + "step": 84000 + }, + { + "FLOPS loss": 0.07498069107532501, + "L0_d": 1045.14, + "MLM loss": 2.155193567276001, + "epoch": 1.8, + "step": 84499 + }, + { + "epoch": 1.8, + "learning_rate": 8.48030612244898e-05, + "loss": 2.1856, + "step": 84500 + }, + { + "FLOPS loss": 0.075527124106884, + "L0_d": 825.73, + "MLM loss": 2.001152515411377, + "epoch": 1.81, + "step": 84999 + }, + { + "epoch": 1.81, + "learning_rate": 8.470102040816327e-05, + "loss": 2.1876, + "step": 85000 + }, + { + "FLOPS loss": 0.06435954570770264, + "L0_d": 629.23, + "MLM loss": 2.0413312911987305, + "epoch": 1.82, + "step": 85499 + }, + { + "epoch": 1.82, + "learning_rate": 8.459897959183673e-05, + "loss": 2.1867, + "step": 85500 + }, + { + "FLOPS loss": 0.09223916381597519, + "L0_d": 719.11, + "MLM loss": 2.1891777515411377, + "epoch": 1.83, + "step": 85999 + }, + { + "epoch": 1.83, + "learning_rate": 8.449714285714286e-05, + "loss": 2.1867, + "step": 86000 + }, + { + "FLOPS loss": 0.06357712298631668, + "L0_d": 667.89, + "MLM loss": 2.057793140411377, + "epoch": 1.84, + "step": 86499 + }, + { + "epoch": 1.84, + "learning_rate": 8.439510204081633e-05, + "loss": 2.1813, + "step": 86500 + }, + { + "FLOPS loss": 0.06833140552043915, + "L0_d": 727.81, + "MLM loss": 2.049499988555908, + "epoch": 1.85, + "step": 86999 + }, + { + "epoch": 1.85, + "learning_rate": 8.42930612244898e-05, + "loss": 2.1804, + "step": 87000 + }, + { + "FLOPS loss": 0.06953922659158707, + "L0_d": 712.0, + "MLM loss": 1.9520695209503174, + "epoch": 1.86, + "step": 87499 + }, + { + "epoch": 1.86, + "learning_rate": 8.419102040816327e-05, + "loss": 2.1804, + "step": 87500 + }, + { + "FLOPS loss": 0.05486714467406273, + "L0_d": 557.69, + "MLM loss": 2.089158058166504, + "epoch": 1.87, + "step": 87999 + }, + { + "epoch": 1.87, + "learning_rate": 8.408918367346939e-05, + "loss": 2.1768, + "step": 88000 + }, + { + "FLOPS loss": 0.08773964643478394, + "L0_d": 780.02, + "MLM loss": 2.027557611465454, + "epoch": 1.88, + "step": 88499 + }, + { + "epoch": 1.88, + "learning_rate": 8.398714285714287e-05, + "loss": 2.1738, + "step": 88500 + }, + { + "FLOPS loss": 0.07384505122900009, + "L0_d": 974.44, + "MLM loss": 2.0310699939727783, + "epoch": 1.9, + "step": 88999 + }, + { + "epoch": 1.9, + "learning_rate": 8.388510204081634e-05, + "loss": 2.1804, + "step": 89000 + }, + { + "FLOPS loss": 0.07133940607309341, + "L0_d": 786.97, + "MLM loss": 2.072397232055664, + "epoch": 1.91, + "step": 89499 + }, + { + "epoch": 1.91, + "learning_rate": 8.37830612244898e-05, + "loss": 2.1774, + "step": 89500 + }, + { + "FLOPS loss": 0.08639474958181381, + "L0_d": 834.44, + "MLM loss": 2.0521416664123535, + "epoch": 1.92, + "step": 89999 + }, + { + "epoch": 1.92, + "learning_rate": 8.368122448979592e-05, + "loss": 2.1771, + "step": 90000 + }, + { + "FLOPS loss": 0.10411433130502701, + "L0_d": 976.3, + "MLM loss": 2.184450149536133, + "epoch": 1.93, + "step": 90499 + }, + { + "epoch": 1.93, + "learning_rate": 8.35791836734694e-05, + "loss": 2.1769, + "step": 90500 + }, + { + "FLOPS loss": 0.07786352187395096, + "L0_d": 972.97, + "MLM loss": 2.1770880222320557, + "epoch": 1.94, + "step": 90999 + }, + { + "epoch": 1.94, + "learning_rate": 8.347714285714286e-05, + "loss": 2.171, + "step": 91000 + }, + { + "FLOPS loss": 0.07419507950544357, + "L0_d": 904.19, + "MLM loss": 2.092041492462158, + "epoch": 1.95, + "step": 91499 + }, + { + "epoch": 1.95, + "learning_rate": 8.337510204081633e-05, + "loss": 2.1724, + "step": 91500 + }, + { + "FLOPS loss": 0.08307668566703796, + "L0_d": 829.2, + "MLM loss": 2.167854070663452, + "epoch": 1.96, + "step": 91999 + }, + { + "epoch": 1.96, + "learning_rate": 8.327326530612245e-05, + "loss": 2.1704, + "step": 92000 + }, + { + "FLOPS loss": 0.06959807127714157, + "L0_d": 839.39, + "MLM loss": 2.129701852798462, + "epoch": 1.97, + "step": 92499 + }, + { + "epoch": 1.97, + "learning_rate": 8.317122448979591e-05, + "loss": 2.1663, + "step": 92500 + }, + { + "FLOPS loss": 0.062158744782209396, + "L0_d": 598.09, + "MLM loss": 1.981863260269165, + "epoch": 1.98, + "step": 92999 + }, + { + "epoch": 1.98, + "learning_rate": 8.306918367346939e-05, + "loss": 2.1691, + "step": 93000 + }, + { + "FLOPS loss": 0.07570803910493851, + "L0_d": 539.5, + "MLM loss": 1.99395751953125, + "epoch": 1.99, + "step": 93499 + }, + { + "epoch": 1.99, + "learning_rate": 8.296714285714287e-05, + "loss": 2.1685, + "step": 93500 + }, + { + "FLOPS loss": 0.0987701267004013, + "L0_d": 1091.38, + "MLM loss": 2.118992805480957, + "epoch": 2.0, + "step": 93999 + }, + { + "epoch": 2.0, + "learning_rate": 8.286551020408163e-05, + "loss": 2.1647, + "step": 94000 + }, + { + "FLOPS loss": 0.06103205308318138, + "L0_d": 586.62, + "MLM loss": 2.1299803256988525, + "epoch": 2.01, + "step": 94499 + }, + { + "epoch": 2.01, + "learning_rate": 8.276346938775511e-05, + "loss": 2.1608, + "step": 94500 + }, + { + "FLOPS loss": 0.09433767199516296, + "L0_d": 764.81, + "MLM loss": 2.127784013748169, + "epoch": 2.02, + "step": 94999 + }, + { + "epoch": 2.02, + "learning_rate": 8.266142857142858e-05, + "loss": 2.1604, + "step": 95000 + }, + { + "FLOPS loss": 0.07245936244726181, + "L0_d": 831.86, + "MLM loss": 2.176636219024658, + "epoch": 2.03, + "step": 95499 + }, + { + "epoch": 2.03, + "learning_rate": 8.255938775510204e-05, + "loss": 2.1592, + "step": 95500 + }, + { + "FLOPS loss": 0.059117577970027924, + "L0_d": 507.25, + "MLM loss": 1.9163801670074463, + "epoch": 2.04, + "step": 95999 + }, + { + "epoch": 2.04, + "learning_rate": 8.245734693877552e-05, + "loss": 2.1556, + "step": 96000 + }, + { + "FLOPS loss": 0.07038378715515137, + "L0_d": 1127.86, + "MLM loss": 2.097640037536621, + "epoch": 2.06, + "step": 96499 + }, + { + "epoch": 2.06, + "learning_rate": 8.235530612244898e-05, + "loss": 2.1538, + "step": 96500 + }, + { + "FLOPS loss": 0.07318626344203949, + "L0_d": 607.73, + "MLM loss": 2.035457134246826, + "epoch": 2.07, + "step": 96999 + }, + { + "epoch": 2.07, + "learning_rate": 8.225326530612245e-05, + "loss": 2.1576, + "step": 97000 + }, + { + "FLOPS loss": 0.0791444182395935, + "L0_d": 816.31, + "MLM loss": 2.1670773029327393, + "epoch": 2.08, + "step": 97499 + }, + { + "epoch": 2.08, + "learning_rate": 8.215122448979591e-05, + "loss": 2.1552, + "step": 97500 + }, + { + "FLOPS loss": 0.07561291009187698, + "L0_d": 588.3, + "MLM loss": 2.1358835697174072, + "epoch": 2.09, + "step": 97999 + }, + { + "epoch": 2.09, + "learning_rate": 8.204938775510205e-05, + "loss": 2.1562, + "step": 98000 + }, + { + "FLOPS loss": 0.07940414547920227, + "L0_d": 718.41, + "MLM loss": 2.0383706092834473, + "epoch": 2.1, + "step": 98499 + }, + { + "epoch": 2.1, + "learning_rate": 8.194734693877551e-05, + "loss": 2.1527, + "step": 98500 + }, + { + "FLOPS loss": 0.06504429876804352, + "L0_d": 550.23, + "MLM loss": 2.195014476776123, + "epoch": 2.11, + "step": 98999 + }, + { + "epoch": 2.11, + "learning_rate": 8.184530612244898e-05, + "loss": 2.1497, + "step": 99000 + }, + { + "FLOPS loss": 0.069733627140522, + "L0_d": 642.94, + "MLM loss": 2.1821696758270264, + "epoch": 2.12, + "step": 99499 + }, + { + "epoch": 2.12, + "learning_rate": 8.174326530612246e-05, + "loss": 2.1516, + "step": 99500 + }, + { + "FLOPS loss": 0.07867594063282013, + "L0_d": 619.06, + "MLM loss": 2.0050911903381348, + "epoch": 2.13, + "step": 99999 + }, + { + "epoch": 2.13, + "learning_rate": 8.164122448979592e-05, + "loss": 2.1481, + "step": 100000 + }, + { + "FLOPS loss": 0.08644675463438034, + "L0_d": 820.27, + "MLM loss": 1.9595686197280884, + "epoch": 2.14, + "step": 100499 + }, + { + "epoch": 2.14, + "learning_rate": 8.153938775510205e-05, + "loss": 2.148, + "step": 100500 + }, + { + "FLOPS loss": 0.061923496425151825, + "L0_d": 575.0, + "MLM loss": 2.031733989715576, + "epoch": 2.15, + "step": 100999 + }, + { + "epoch": 2.15, + "learning_rate": 8.143734693877552e-05, + "loss": 2.1482, + "step": 101000 + }, + { + "FLOPS loss": 0.07247359305620193, + "L0_d": 729.3, + "MLM loss": 2.111258029937744, + "epoch": 2.16, + "step": 101499 + }, + { + "epoch": 2.16, + "learning_rate": 8.133530612244898e-05, + "loss": 2.1423, + "step": 101500 + }, + { + "FLOPS loss": 0.07209628820419312, + "L0_d": 757.38, + "MLM loss": 2.086599826812744, + "epoch": 2.17, + "step": 101999 + }, + { + "epoch": 2.17, + "learning_rate": 8.123326530612245e-05, + "loss": 2.142, + "step": 102000 + }, + { + "FLOPS loss": 0.07832463830709457, + "L0_d": 702.0, + "MLM loss": 2.0252575874328613, + "epoch": 2.18, + "step": 102499 + }, + { + "epoch": 2.18, + "learning_rate": 8.113142857142858e-05, + "loss": 2.1446, + "step": 102500 + }, + { + "FLOPS loss": 0.07046867161989212, + "L0_d": 741.03, + "MLM loss": 2.137556552886963, + "epoch": 2.19, + "step": 102999 + }, + { + "epoch": 2.19, + "learning_rate": 8.102938775510205e-05, + "loss": 2.1419, + "step": 103000 + }, + { + "FLOPS loss": 0.0920414999127388, + "L0_d": 1184.77, + "MLM loss": 2.0097923278808594, + "epoch": 2.2, + "step": 103499 + }, + { + "epoch": 2.2, + "learning_rate": 8.092734693877551e-05, + "loss": 2.1431, + "step": 103500 + }, + { + "FLOPS loss": 0.06898413598537445, + "L0_d": 748.45, + "MLM loss": 2.123561382293701, + "epoch": 2.21, + "step": 103999 + }, + { + "epoch": 2.21, + "learning_rate": 8.082530612244899e-05, + "loss": 2.1411, + "step": 104000 + }, + { + "FLOPS loss": 0.07652837038040161, + "L0_d": 766.36, + "MLM loss": 2.1538445949554443, + "epoch": 2.23, + "step": 104499 + }, + { + "epoch": 2.23, + "learning_rate": 8.07234693877551e-05, + "loss": 2.1418, + "step": 104500 + }, + { + "FLOPS loss": 0.09778144955635071, + "L0_d": 956.75, + "MLM loss": 2.149035930633545, + "epoch": 2.24, + "step": 104999 + }, + { + "epoch": 2.24, + "learning_rate": 8.062142857142858e-05, + "loss": 2.1406, + "step": 105000 + }, + { + "FLOPS loss": 0.07300714403390884, + "L0_d": 606.64, + "MLM loss": 2.056023597717285, + "epoch": 2.25, + "step": 105499 + }, + { + "epoch": 2.25, + "learning_rate": 8.051938775510205e-05, + "loss": 2.1359, + "step": 105500 + }, + { + "FLOPS loss": 0.09272401034832001, + "L0_d": 801.7, + "MLM loss": 2.0067789554595947, + "epoch": 2.26, + "step": 105999 + }, + { + "epoch": 2.26, + "learning_rate": 8.041734693877552e-05, + "loss": 2.1394, + "step": 106000 + }, + { + "FLOPS loss": 0.08801499009132385, + "L0_d": 878.59, + "MLM loss": 2.035496234893799, + "epoch": 2.27, + "step": 106499 + }, + { + "epoch": 2.27, + "learning_rate": 8.031551020408164e-05, + "loss": 2.1345, + "step": 106500 + }, + { + "FLOPS loss": 0.08799153566360474, + "L0_d": 653.61, + "MLM loss": 2.0127854347229004, + "epoch": 2.28, + "step": 106999 + }, + { + "epoch": 2.28, + "learning_rate": 8.02134693877551e-05, + "loss": 2.1346, + "step": 107000 + }, + { + "FLOPS loss": 0.06928052008152008, + "L0_d": 940.64, + "MLM loss": 2.041571617126465, + "epoch": 2.29, + "step": 107499 + }, + { + "epoch": 2.29, + "learning_rate": 8.011142857142857e-05, + "loss": 2.1327, + "step": 107500 + }, + { + "FLOPS loss": 0.08908580243587494, + "L0_d": 769.25, + "MLM loss": 2.1204586029052734, + "epoch": 2.3, + "step": 107999 + }, + { + "epoch": 2.3, + "learning_rate": 8.000938775510205e-05, + "loss": 2.1354, + "step": 108000 + }, + { + "FLOPS loss": 0.08444768190383911, + "L0_d": 870.19, + "MLM loss": 2.0592079162597656, + "epoch": 2.31, + "step": 108499 + }, + { + "epoch": 2.31, + "learning_rate": 7.990755102040817e-05, + "loss": 2.1354, + "step": 108500 + }, + { + "FLOPS loss": 0.06808961927890778, + "L0_d": 588.69, + "MLM loss": 2.1283202171325684, + "epoch": 2.32, + "step": 108999 + }, + { + "epoch": 2.32, + "learning_rate": 7.980551020408163e-05, + "loss": 2.1278, + "step": 109000 + }, + { + "FLOPS loss": 0.07635576277971268, + "L0_d": 898.77, + "MLM loss": 1.9690206050872803, + "epoch": 2.33, + "step": 109499 + }, + { + "epoch": 2.33, + "learning_rate": 7.97034693877551e-05, + "loss": 2.1302, + "step": 109500 + }, + { + "FLOPS loss": 0.08134192228317261, + "L0_d": 714.91, + "MLM loss": 2.120096445083618, + "epoch": 2.34, + "step": 109999 + }, + { + "epoch": 2.34, + "learning_rate": 7.960142857142858e-05, + "loss": 2.1279, + "step": 110000 + }, + { + "FLOPS loss": 0.07751913368701935, + "L0_d": 571.03, + "MLM loss": 2.038954496383667, + "epoch": 2.35, + "step": 110499 + }, + { + "epoch": 2.35, + "learning_rate": 7.94995918367347e-05, + "loss": 2.1268, + "step": 110500 + }, + { + "FLOPS loss": 0.07746893912553787, + "L0_d": 878.41, + "MLM loss": 2.1648635864257812, + "epoch": 2.36, + "step": 110999 + }, + { + "epoch": 2.36, + "learning_rate": 7.939755102040816e-05, + "loss": 2.1271, + "step": 111000 + }, + { + "FLOPS loss": 0.07608819752931595, + "L0_d": 597.73, + "MLM loss": 1.9035899639129639, + "epoch": 2.37, + "step": 111499 + }, + { + "epoch": 2.37, + "learning_rate": 7.929551020408164e-05, + "loss": 2.1277, + "step": 111500 + }, + { + "FLOPS loss": 0.07868288457393646, + "L0_d": 842.05, + "MLM loss": 2.2060461044311523, + "epoch": 2.39, + "step": 111999 + }, + { + "epoch": 2.39, + "learning_rate": 7.91934693877551e-05, + "loss": 2.1268, + "step": 112000 + }, + { + "FLOPS loss": 0.06169697642326355, + "L0_d": 662.14, + "MLM loss": 2.0528273582458496, + "epoch": 2.4, + "step": 112499 + }, + { + "epoch": 2.4, + "learning_rate": 7.909163265306124e-05, + "loss": 2.1269, + "step": 112500 + }, + { + "FLOPS loss": 0.07529112696647644, + "L0_d": 815.22, + "MLM loss": 2.0095815658569336, + "epoch": 2.41, + "step": 112999 + }, + { + "epoch": 2.41, + "learning_rate": 7.89895918367347e-05, + "loss": 2.1241, + "step": 113000 + }, + { + "FLOPS loss": 0.0667743906378746, + "L0_d": 679.59, + "MLM loss": 2.011075258255005, + "epoch": 2.42, + "step": 113499 + }, + { + "epoch": 2.42, + "learning_rate": 7.888755102040817e-05, + "loss": 2.1167, + "step": 113500 + }, + { + "FLOPS loss": 0.08264929056167603, + "L0_d": 838.69, + "MLM loss": 1.9273202419281006, + "epoch": 2.43, + "step": 113999 + }, + { + "epoch": 2.43, + "learning_rate": 7.878551020408163e-05, + "loss": 2.1225, + "step": 114000 + }, + { + "FLOPS loss": 0.09935013949871063, + "L0_d": 979.97, + "MLM loss": 1.986405611038208, + "epoch": 2.44, + "step": 114499 + }, + { + "epoch": 2.44, + "learning_rate": 7.868367346938777e-05, + "loss": 2.1164, + "step": 114500 + }, + { + "FLOPS loss": 0.09770844131708145, + "L0_d": 787.3, + "MLM loss": 2.0416669845581055, + "epoch": 2.45, + "step": 114999 + }, + { + "epoch": 2.45, + "learning_rate": 7.858183673469389e-05, + "loss": 2.121, + "step": 115000 + }, + { + "FLOPS loss": 0.05972149968147278, + "L0_d": 543.11, + "MLM loss": 2.0023961067199707, + "epoch": 2.46, + "step": 115499 + }, + { + "epoch": 2.46, + "learning_rate": 7.847979591836735e-05, + "loss": 2.1165, + "step": 115500 + }, + { + "FLOPS loss": 0.08709295839071274, + "L0_d": 732.72, + "MLM loss": 2.027191638946533, + "epoch": 2.47, + "step": 115999 + }, + { + "epoch": 2.47, + "learning_rate": 7.837775510204082e-05, + "loss": 2.1156, + "step": 116000 + }, + { + "FLOPS loss": 0.08640507608652115, + "L0_d": 845.5, + "MLM loss": 2.0632519721984863, + "epoch": 2.48, + "step": 116499 + }, + { + "epoch": 2.48, + "learning_rate": 7.827571428571428e-05, + "loss": 2.1191, + "step": 116500 + }, + { + "FLOPS loss": 0.08174131065607071, + "L0_d": 931.69, + "MLM loss": 1.9130254983901978, + "epoch": 2.49, + "step": 116999 + }, + { + "epoch": 2.49, + "learning_rate": 7.817367346938776e-05, + "loss": 2.1117, + "step": 117000 + }, + { + "FLOPS loss": 0.08186709880828857, + "L0_d": 724.42, + "MLM loss": 2.023085117340088, + "epoch": 2.5, + "step": 117499 + }, + { + "epoch": 2.5, + "learning_rate": 7.807163265306124e-05, + "loss": 2.1155, + "step": 117500 + }, + { + "FLOPS loss": 0.08723706007003784, + "L0_d": 1007.92, + "MLM loss": 2.0281600952148438, + "epoch": 2.51, + "step": 117999 + }, + { + "epoch": 2.51, + "learning_rate": 7.79695918367347e-05, + "loss": 2.1153, + "step": 118000 + }, + { + "FLOPS loss": 0.07498479634523392, + "L0_d": 667.45, + "MLM loss": 1.9852380752563477, + "epoch": 2.52, + "step": 118499 + }, + { + "epoch": 2.52, + "learning_rate": 7.786755102040817e-05, + "loss": 2.1133, + "step": 118500 + }, + { + "FLOPS loss": 0.10051379352807999, + "L0_d": 1041.19, + "MLM loss": 1.9548364877700806, + "epoch": 2.53, + "step": 118999 + }, + { + "epoch": 2.53, + "learning_rate": 7.776551020408163e-05, + "loss": 2.1133, + "step": 119000 + }, + { + "FLOPS loss": 0.08071957528591156, + "L0_d": 773.66, + "MLM loss": 2.1546220779418945, + "epoch": 2.54, + "step": 119499 + }, + { + "epoch": 2.54, + "learning_rate": 7.766367346938775e-05, + "loss": 2.1142, + "step": 119500 + }, + { + "FLOPS loss": 0.08354714512825012, + "L0_d": 889.59, + "MLM loss": 1.9811208248138428, + "epoch": 2.56, + "step": 119999 + }, + { + "epoch": 2.56, + "learning_rate": 7.756163265306123e-05, + "loss": 2.1146, + "step": 120000 + }, + { + "FLOPS loss": 0.07952099293470383, + "L0_d": 512.33, + "MLM loss": 2.0295233726501465, + "epoch": 2.57, + "step": 120499 + }, + { + "epoch": 2.57, + "learning_rate": 7.74595918367347e-05, + "loss": 2.1118, + "step": 120500 + }, + { + "FLOPS loss": 0.07554259896278381, + "L0_d": 694.92, + "MLM loss": 1.9999445676803589, + "epoch": 2.58, + "step": 120999 + }, + { + "epoch": 2.58, + "learning_rate": 7.735755102040817e-05, + "loss": 2.1113, + "step": 121000 + }, + { + "FLOPS loss": 0.07274837791919708, + "L0_d": 696.59, + "MLM loss": 2.136122465133667, + "epoch": 2.59, + "step": 121499 + }, + { + "epoch": 2.59, + "learning_rate": 7.725571428571428e-05, + "loss": 2.108, + "step": 121500 + }, + { + "FLOPS loss": 0.08334198594093323, + "L0_d": 939.45, + "MLM loss": 2.0925543308258057, + "epoch": 2.6, + "step": 121999 + }, + { + "epoch": 2.6, + "learning_rate": 7.715367346938776e-05, + "loss": 2.1041, + "step": 122000 + }, + { + "FLOPS loss": 0.07802562415599823, + "L0_d": 719.03, + "MLM loss": 1.9580026865005493, + "epoch": 2.61, + "step": 122499 + }, + { + "epoch": 2.61, + "learning_rate": 7.705163265306122e-05, + "loss": 2.1062, + "step": 122500 + }, + { + "FLOPS loss": 0.07663872092962265, + "L0_d": 767.58, + "MLM loss": 2.0476346015930176, + "epoch": 2.62, + "step": 122999 + }, + { + "epoch": 2.62, + "learning_rate": 7.694979591836736e-05, + "loss": 2.1065, + "step": 123000 + }, + { + "FLOPS loss": 0.07221131026744843, + "L0_d": 614.86, + "MLM loss": 2.0577306747436523, + "epoch": 2.63, + "step": 123499 + }, + { + "epoch": 2.63, + "learning_rate": 7.684775510204082e-05, + "loss": 2.1038, + "step": 123500 + }, + { + "FLOPS loss": 0.07997844368219376, + "L0_d": 788.98, + "MLM loss": 1.9502489566802979, + "epoch": 2.64, + "step": 123999 + }, + { + "epoch": 2.64, + "learning_rate": 7.674571428571429e-05, + "loss": 2.1017, + "step": 124000 + }, + { + "FLOPS loss": 0.07191063463687897, + "L0_d": 812.03, + "MLM loss": 1.900945782661438, + "epoch": 2.65, + "step": 124499 + }, + { + "epoch": 2.65, + "learning_rate": 7.664367346938775e-05, + "loss": 2.1041, + "step": 124500 + }, + { + "FLOPS loss": 0.07377645373344421, + "L0_d": 741.73, + "MLM loss": 1.984440803527832, + "epoch": 2.66, + "step": 124999 + }, + { + "epoch": 2.66, + "learning_rate": 7.654163265306123e-05, + "loss": 2.1027, + "step": 125000 + }, + { + "FLOPS loss": 0.07682693749666214, + "L0_d": 721.17, + "MLM loss": 2.003563165664673, + "epoch": 2.67, + "step": 125499 + }, + { + "epoch": 2.67, + "learning_rate": 7.643959183673471e-05, + "loss": 2.1, + "step": 125500 + }, + { + "FLOPS loss": 0.08019888401031494, + "L0_d": 720.73, + "MLM loss": 2.0952367782592773, + "epoch": 2.68, + "step": 125999 + }, + { + "epoch": 2.68, + "learning_rate": 7.633755102040817e-05, + "loss": 2.1036, + "step": 126000 + }, + { + "FLOPS loss": 0.08629703521728516, + "L0_d": 741.38, + "MLM loss": 1.9909753799438477, + "epoch": 2.69, + "step": 126499 + }, + { + "epoch": 2.69, + "learning_rate": 7.623551020408164e-05, + "loss": 2.1026, + "step": 126500 + }, + { + "FLOPS loss": 0.0715538039803505, + "L0_d": 745.28, + "MLM loss": 2.0442450046539307, + "epoch": 2.7, + "step": 126999 + }, + { + "epoch": 2.7, + "learning_rate": 7.613367346938776e-05, + "loss": 2.1007, + "step": 127000 + }, + { + "FLOPS loss": 0.06486863642930984, + "L0_d": 602.3, + "MLM loss": 2.0304067134857178, + "epoch": 2.72, + "step": 127499 + }, + { + "epoch": 2.72, + "learning_rate": 7.603163265306122e-05, + "loss": 2.0963, + "step": 127500 + }, + { + "FLOPS loss": 0.07741992175579071, + "L0_d": 633.12, + "MLM loss": 2.092017650604248, + "epoch": 2.73, + "step": 127999 + }, + { + "epoch": 2.73, + "learning_rate": 7.592959183673469e-05, + "loss": 2.0985, + "step": 128000 + }, + { + "FLOPS loss": 0.055709708482027054, + "L0_d": 678.59, + "MLM loss": 1.9564106464385986, + "epoch": 2.74, + "step": 128499 + }, + { + "epoch": 2.74, + "learning_rate": 7.582755102040817e-05, + "loss": 2.0985, + "step": 128500 + }, + { + "FLOPS loss": 0.08097654581069946, + "L0_d": 947.98, + "MLM loss": 2.071964740753174, + "epoch": 2.75, + "step": 128999 + }, + { + "epoch": 2.75, + "learning_rate": 7.572571428571429e-05, + "loss": 2.098, + "step": 129000 + }, + { + "FLOPS loss": 0.08121536672115326, + "L0_d": 914.73, + "MLM loss": 1.9944313764572144, + "epoch": 2.76, + "step": 129499 + }, + { + "epoch": 2.76, + "learning_rate": 7.562367346938775e-05, + "loss": 2.092, + "step": 129500 + }, + { + "FLOPS loss": 0.07965188473463058, + "L0_d": 735.81, + "MLM loss": 2.1375985145568848, + "epoch": 2.77, + "step": 129999 + }, + { + "epoch": 2.77, + "learning_rate": 7.552163265306123e-05, + "loss": 2.0923, + "step": 130000 + }, + { + "FLOPS loss": 0.06054941192269325, + "L0_d": 546.17, + "MLM loss": 2.0754427909851074, + "epoch": 2.78, + "step": 130499 + }, + { + "epoch": 2.78, + "learning_rate": 7.54195918367347e-05, + "loss": 2.0946, + "step": 130500 + }, + { + "FLOPS loss": 0.07722670584917068, + "L0_d": 1042.67, + "MLM loss": 2.0651721954345703, + "epoch": 2.79, + "step": 130999 + }, + { + "epoch": 2.79, + "learning_rate": 7.531775510204082e-05, + "loss": 2.0917, + "step": 131000 + }, + { + "FLOPS loss": 0.08554383367300034, + "L0_d": 919.33, + "MLM loss": 2.1933531761169434, + "epoch": 2.8, + "step": 131499 + }, + { + "epoch": 2.8, + "learning_rate": 7.52157142857143e-05, + "loss": 2.0932, + "step": 131500 + }, + { + "FLOPS loss": 0.09546613693237305, + "L0_d": 709.77, + "MLM loss": 2.0176644325256348, + "epoch": 2.81, + "step": 131999 + }, + { + "epoch": 2.81, + "learning_rate": 7.511367346938776e-05, + "loss": 2.0914, + "step": 132000 + }, + { + "FLOPS loss": 0.08541066944599152, + "L0_d": 823.94, + "MLM loss": 2.066415786743164, + "epoch": 2.82, + "step": 132499 + }, + { + "epoch": 2.82, + "learning_rate": 7.501163265306122e-05, + "loss": 2.0921, + "step": 132500 + }, + { + "FLOPS loss": 0.08676711469888687, + "L0_d": 714.61, + "MLM loss": 2.1011126041412354, + "epoch": 2.83, + "step": 132999 + }, + { + "epoch": 2.83, + "learning_rate": 7.490979591836736e-05, + "loss": 2.0941, + "step": 133000 + }, + { + "FLOPS loss": 0.06921195238828659, + "L0_d": 802.39, + "MLM loss": 1.955210566520691, + "epoch": 2.84, + "step": 133499 + }, + { + "epoch": 2.84, + "learning_rate": 7.480775510204082e-05, + "loss": 2.0922, + "step": 133500 + }, + { + "FLOPS loss": 0.07848220318555832, + "L0_d": 554.2, + "MLM loss": 2.0789244174957275, + "epoch": 2.85, + "step": 133999 + }, + { + "epoch": 2.85, + "learning_rate": 7.470571428571429e-05, + "loss": 2.0911, + "step": 134000 + }, + { + "FLOPS loss": 0.08104442059993744, + "L0_d": 967.77, + "MLM loss": 2.0796117782592773, + "epoch": 2.86, + "step": 134499 + }, + { + "epoch": 2.86, + "learning_rate": 7.460367346938776e-05, + "loss": 2.0915, + "step": 134500 + }, + { + "FLOPS loss": 0.08678069710731506, + "L0_d": 963.12, + "MLM loss": 2.073706865310669, + "epoch": 2.87, + "step": 134999 + }, + { + "epoch": 2.87, + "learning_rate": 7.450183673469389e-05, + "loss": 2.0891, + "step": 135000 + }, + { + "FLOPS loss": 0.07000401616096497, + "L0_d": 784.11, + "MLM loss": 1.962247610092163, + "epoch": 2.89, + "step": 135499 + }, + { + "epoch": 2.89, + "learning_rate": 7.439979591836735e-05, + "loss": 2.0861, + "step": 135500 + }, + { + "FLOPS loss": 0.06950665265321732, + "L0_d": 535.03, + "MLM loss": 2.1357204914093018, + "epoch": 2.9, + "step": 135999 + }, + { + "epoch": 2.9, + "learning_rate": 7.429795918367347e-05, + "loss": 2.0883, + "step": 136000 + }, + { + "FLOPS loss": 0.08367476612329483, + "L0_d": 646.3, + "MLM loss": 1.8792662620544434, + "epoch": 2.91, + "step": 136499 + }, + { + "epoch": 2.91, + "learning_rate": 7.419591836734694e-05, + "loss": 2.0846, + "step": 136500 + }, + { + "FLOPS loss": 0.06122388690710068, + "L0_d": 699.14, + "MLM loss": 1.9720458984375, + "epoch": 2.92, + "step": 136999 + }, + { + "epoch": 2.92, + "learning_rate": 7.409387755102041e-05, + "loss": 2.0875, + "step": 137000 + }, + { + "FLOPS loss": 0.0852053165435791, + "L0_d": 877.64, + "MLM loss": 1.9794983863830566, + "epoch": 2.93, + "step": 137499 + }, + { + "epoch": 2.93, + "learning_rate": 7.399183673469388e-05, + "loss": 2.0881, + "step": 137500 + }, + { + "FLOPS loss": 0.09250415861606598, + "L0_d": 736.42, + "MLM loss": 2.1271352767944336, + "epoch": 2.94, + "step": 137999 + }, + { + "epoch": 2.94, + "learning_rate": 7.388979591836736e-05, + "loss": 2.0844, + "step": 138000 + }, + { + "FLOPS loss": 0.09295733273029327, + "L0_d": 944.11, + "MLM loss": 1.9891127347946167, + "epoch": 2.95, + "step": 138499 + }, + { + "epoch": 2.95, + "learning_rate": 7.378775510204082e-05, + "loss": 2.083, + "step": 138500 + }, + { + "FLOPS loss": 0.069969043135643, + "L0_d": 701.23, + "MLM loss": 1.936417818069458, + "epoch": 2.96, + "step": 138999 + }, + { + "epoch": 2.96, + "learning_rate": 7.368571428571429e-05, + "loss": 2.0795, + "step": 139000 + }, + { + "FLOPS loss": 0.06195145100355148, + "L0_d": 546.84, + "MLM loss": 1.9803211688995361, + "epoch": 2.97, + "step": 139499 + }, + { + "epoch": 2.97, + "learning_rate": 7.358367346938776e-05, + "loss": 2.083, + "step": 139500 + }, + { + "FLOPS loss": 0.07750408351421356, + "L0_d": 571.36, + "MLM loss": 1.9305696487426758, + "epoch": 2.98, + "step": 139999 + }, + { + "epoch": 2.98, + "learning_rate": 7.348183673469387e-05, + "loss": 2.0841, + "step": 140000 + }, + { + "FLOPS loss": 0.08334793150424957, + "L0_d": 848.39, + "MLM loss": 1.814992904663086, + "epoch": 2.99, + "step": 140499 + }, + { + "epoch": 2.99, + "learning_rate": 7.337979591836735e-05, + "loss": 2.0783, + "step": 140500 + }, + { + "FLOPS loss": 0.07291968166828156, + "L0_d": 730.91, + "MLM loss": 2.0424416065216064, + "epoch": 3.0, + "step": 140999 + }, + { + "epoch": 3.0, + "learning_rate": 7.327775510204083e-05, + "loss": 2.0796, + "step": 141000 + }, + { + "FLOPS loss": 0.08008062839508057, + "L0_d": 827.06, + "MLM loss": 2.070688486099243, + "epoch": 3.01, + "step": 141499 + }, + { + "epoch": 3.01, + "learning_rate": 7.317571428571429e-05, + "loss": 2.0768, + "step": 141500 + }, + { + "FLOPS loss": 0.07891778647899628, + "L0_d": 1120.58, + "MLM loss": 2.0585923194885254, + "epoch": 3.02, + "step": 141999 + }, + { + "epoch": 3.02, + "learning_rate": 7.307387755102041e-05, + "loss": 2.0781, + "step": 142000 + }, + { + "FLOPS loss": 0.08013079315423965, + "L0_d": 939.69, + "MLM loss": 2.0557148456573486, + "epoch": 3.03, + "step": 142499 + }, + { + "epoch": 3.03, + "learning_rate": 7.297183673469388e-05, + "loss": 2.0711, + "step": 142500 + }, + { + "FLOPS loss": 0.07039299607276917, + "L0_d": 884.61, + "MLM loss": 2.26658034324646, + "epoch": 3.05, + "step": 142999 + }, + { + "epoch": 3.05, + "learning_rate": 7.286979591836734e-05, + "loss": 2.0762, + "step": 143000 + }, + { + "FLOPS loss": 0.08341009169816971, + "L0_d": 1099.41, + "MLM loss": 1.9736123085021973, + "epoch": 3.06, + "step": 143499 + }, + { + "epoch": 3.06, + "learning_rate": 7.276775510204082e-05, + "loss": 2.0726, + "step": 143500 + }, + { + "FLOPS loss": 0.07099974900484085, + "L0_d": 677.59, + "MLM loss": 1.942285180091858, + "epoch": 3.07, + "step": 143999 + }, + { + "epoch": 3.07, + "learning_rate": 7.26657142857143e-05, + "loss": 2.0704, + "step": 144000 + }, + { + "FLOPS loss": 0.07856574654579163, + "L0_d": 963.98, + "MLM loss": 1.9899259805679321, + "epoch": 3.08, + "step": 144499 + }, + { + "epoch": 3.08, + "learning_rate": 7.256367346938776e-05, + "loss": 2.0739, + "step": 144500 + }, + { + "FLOPS loss": 0.07401182502508163, + "L0_d": 681.78, + "MLM loss": 1.916780948638916, + "epoch": 3.09, + "step": 144999 + }, + { + "epoch": 3.09, + "learning_rate": 7.246183673469387e-05, + "loss": 2.0708, + "step": 145000 + }, + { + "FLOPS loss": 0.09351439774036407, + "L0_d": 556.44, + "MLM loss": 2.0708470344543457, + "epoch": 3.1, + "step": 145499 + }, + { + "epoch": 3.1, + "learning_rate": 7.235979591836735e-05, + "loss": 2.0742, + "step": 145500 + }, + { + "FLOPS loss": 0.07324228435754776, + "L0_d": 875.48, + "MLM loss": 2.053828477859497, + "epoch": 3.11, + "step": 145999 + }, + { + "epoch": 3.11, + "learning_rate": 7.225775510204083e-05, + "loss": 2.0709, + "step": 146000 + }, + { + "FLOPS loss": 0.08776471018791199, + "L0_d": 757.81, + "MLM loss": 2.021151065826416, + "epoch": 3.12, + "step": 146499 + }, + { + "epoch": 3.12, + "learning_rate": 7.215571428571429e-05, + "loss": 2.0748, + "step": 146500 + }, + { + "FLOPS loss": 0.07924683392047882, + "L0_d": 737.95, + "MLM loss": 2.0102930068969727, + "epoch": 3.13, + "step": 146999 + }, + { + "epoch": 3.13, + "learning_rate": 7.205367346938776e-05, + "loss": 2.0692, + "step": 147000 + }, + { + "FLOPS loss": 0.07468462735414505, + "L0_d": 579.72, + "MLM loss": 2.31913685798645, + "epoch": 3.14, + "step": 147499 + }, + { + "epoch": 3.14, + "learning_rate": 7.195183673469388e-05, + "loss": 2.0714, + "step": 147500 + }, + { + "FLOPS loss": 0.07593423873186111, + "L0_d": 786.03, + "MLM loss": 1.9522225856781006, + "epoch": 3.15, + "step": 147999 + }, + { + "epoch": 3.15, + "learning_rate": 7.184979591836734e-05, + "loss": 2.0678, + "step": 148000 + }, + { + "FLOPS loss": 0.08743783086538315, + "L0_d": 833.62, + "MLM loss": 2.035630702972412, + "epoch": 3.16, + "step": 148499 + }, + { + "epoch": 3.16, + "learning_rate": 7.174775510204082e-05, + "loss": 2.0673, + "step": 148500 + }, + { + "FLOPS loss": 0.06403844803571701, + "L0_d": 583.55, + "MLM loss": 1.965914011001587, + "epoch": 3.17, + "step": 148999 + }, + { + "epoch": 3.17, + "learning_rate": 7.16457142857143e-05, + "loss": 2.0652, + "step": 149000 + }, + { + "FLOPS loss": 0.07395175844430923, + "L0_d": 928.45, + "MLM loss": 1.829352855682373, + "epoch": 3.18, + "step": 149499 + }, + { + "epoch": 3.18, + "learning_rate": 7.154367346938776e-05, + "loss": 2.067, + "step": 149500 + }, + { + "FLOPS loss": 0.07722048461437225, + "L0_d": 903.33, + "MLM loss": 1.9278677701950073, + "epoch": 3.19, + "step": 149999 + }, + { + "epoch": 3.19, + "learning_rate": 7.144183673469388e-05, + "loss": 2.0659, + "step": 150000 + }, + { + "FLOPS loss": 0.072312131524086, + "L0_d": 516.84, + "MLM loss": 1.9207533597946167, + "epoch": 3.2, + "step": 150499 + }, + { + "epoch": 3.2, + "learning_rate": 7.133979591836735e-05, + "loss": 2.0657, + "step": 150500 + }, + { + "FLOPS loss": 0.10924256592988968, + "L0_d": 1259.34, + "MLM loss": 1.9205459356307983, + "epoch": 3.22, + "step": 150999 + }, + { + "epoch": 3.22, + "learning_rate": 7.123775510204081e-05, + "loss": 2.0636, + "step": 151000 + }, + { + "FLOPS loss": 0.0764654129743576, + "L0_d": 890.97, + "MLM loss": 2.1768999099731445, + "epoch": 3.23, + "step": 151499 + }, + { + "epoch": 3.23, + "learning_rate": 7.113571428571429e-05, + "loss": 2.0645, + "step": 151500 + }, + { + "FLOPS loss": 0.07931890338659286, + "L0_d": 1040.11, + "MLM loss": 1.8496606349945068, + "epoch": 3.24, + "step": 151999 + }, + { + "epoch": 3.24, + "learning_rate": 7.103367346938776e-05, + "loss": 2.0592, + "step": 152000 + }, + { + "FLOPS loss": 0.0654386356472969, + "L0_d": 611.86, + "MLM loss": 1.8984403610229492, + "epoch": 3.25, + "step": 152499 + }, + { + "epoch": 3.25, + "learning_rate": 7.093183673469388e-05, + "loss": 2.0637, + "step": 152500 + }, + { + "FLOPS loss": 0.07457588613033295, + "L0_d": 571.36, + "MLM loss": 1.9137520790100098, + "epoch": 3.26, + "step": 152999 + }, + { + "epoch": 3.26, + "learning_rate": 7.082979591836734e-05, + "loss": 2.06, + "step": 153000 + }, + { + "FLOPS loss": 0.09647884219884872, + "L0_d": 1122.12, + "MLM loss": 2.0544707775115967, + "epoch": 3.27, + "step": 153499 + }, + { + "epoch": 3.27, + "learning_rate": 7.072775510204082e-05, + "loss": 2.0605, + "step": 153500 + }, + { + "FLOPS loss": 0.08411688357591629, + "L0_d": 904.66, + "MLM loss": 1.8506371974945068, + "epoch": 3.28, + "step": 153999 + }, + { + "epoch": 3.28, + "learning_rate": 7.062571428571428e-05, + "loss": 2.0626, + "step": 154000 + }, + { + "FLOPS loss": 0.08170817792415619, + "L0_d": 845.39, + "MLM loss": 2.004265069961548, + "epoch": 3.29, + "step": 154499 + }, + { + "epoch": 3.29, + "learning_rate": 7.05238775510204e-05, + "loss": 2.0625, + "step": 154500 + }, + { + "FLOPS loss": 0.08299441635608673, + "L0_d": 780.56, + "MLM loss": 2.034635543823242, + "epoch": 3.3, + "step": 154999 + }, + { + "epoch": 3.3, + "learning_rate": 7.042183673469388e-05, + "loss": 2.0619, + "step": 155000 + }, + { + "FLOPS loss": 0.07230028510093689, + "L0_d": 614.62, + "MLM loss": 1.9387309551239014, + "epoch": 3.31, + "step": 155499 + }, + { + "epoch": 3.31, + "learning_rate": 7.031979591836735e-05, + "loss": 2.0595, + "step": 155500 + }, + { + "FLOPS loss": 0.06355029344558716, + "L0_d": 695.94, + "MLM loss": 1.9958910942077637, + "epoch": 3.32, + "step": 155999 + }, + { + "epoch": 3.32, + "learning_rate": 7.021775510204081e-05, + "loss": 2.0613, + "step": 156000 + }, + { + "FLOPS loss": 0.07059838622808456, + "L0_d": 713.44, + "MLM loss": 1.919121503829956, + "epoch": 3.33, + "step": 156499 + }, + { + "epoch": 3.33, + "learning_rate": 7.011571428571429e-05, + "loss": 2.0596, + "step": 156500 + }, + { + "FLOPS loss": 0.07795380800962448, + "L0_d": 713.16, + "MLM loss": 1.9082648754119873, + "epoch": 3.34, + "step": 156999 + }, + { + "epoch": 3.34, + "learning_rate": 7.001387755102041e-05, + "loss": 2.0574, + "step": 157000 + }, + { + "FLOPS loss": 0.08485406637191772, + "L0_d": 786.42, + "MLM loss": 1.8752378225326538, + "epoch": 3.35, + "step": 157499 + }, + { + "epoch": 3.35, + "learning_rate": 6.991183673469388e-05, + "loss": 2.0575, + "step": 157500 + }, + { + "FLOPS loss": 0.08777129650115967, + "L0_d": 797.16, + "MLM loss": 2.156707286834717, + "epoch": 3.36, + "step": 157999 + }, + { + "epoch": 3.36, + "learning_rate": 6.980979591836735e-05, + "loss": 2.0576, + "step": 158000 + }, + { + "FLOPS loss": 0.07295342534780502, + "L0_d": 668.17, + "MLM loss": 1.9450191259384155, + "epoch": 3.38, + "step": 158499 + }, + { + "epoch": 3.38, + "learning_rate": 6.970775510204082e-05, + "loss": 2.059, + "step": 158500 + }, + { + "FLOPS loss": 0.08175527304410934, + "L0_d": 751.41, + "MLM loss": 2.059969186782837, + "epoch": 3.39, + "step": 158999 + }, + { + "epoch": 3.39, + "learning_rate": 6.960591836734694e-05, + "loss": 2.0571, + "step": 159000 + }, + { + "FLOPS loss": 0.09450782090425491, + "L0_d": 1096.31, + "MLM loss": 1.8040263652801514, + "epoch": 3.4, + "step": 159499 + }, + { + "epoch": 3.4, + "learning_rate": 6.950387755102042e-05, + "loss": 2.0554, + "step": 159500 + }, + { + "FLOPS loss": 0.08854455500841141, + "L0_d": 894.12, + "MLM loss": 1.8809754848480225, + "epoch": 3.41, + "step": 159999 + }, + { + "epoch": 3.41, + "learning_rate": 6.940183673469388e-05, + "loss": 2.0546, + "step": 160000 + }, + { + "FLOPS loss": 0.07546830177307129, + "L0_d": 837.89, + "MLM loss": 2.0773215293884277, + "epoch": 3.42, + "step": 160499 + }, + { + "epoch": 3.42, + "learning_rate": 6.929979591836735e-05, + "loss": 2.0534, + "step": 160500 + }, + { + "FLOPS loss": 0.06834632158279419, + "L0_d": 618.45, + "MLM loss": 1.9398113489151, + "epoch": 3.43, + "step": 160999 + }, + { + "epoch": 3.43, + "learning_rate": 6.919775510204081e-05, + "loss": 2.0519, + "step": 161000 + }, + { + "FLOPS loss": 0.08442704379558563, + "L0_d": 801.73, + "MLM loss": 1.9613676071166992, + "epoch": 3.44, + "step": 161499 + }, + { + "epoch": 3.44, + "learning_rate": 6.909591836734695e-05, + "loss": 2.0518, + "step": 161500 + }, + { + "FLOPS loss": 0.08665428310632706, + "L0_d": 717.11, + "MLM loss": 2.046541213989258, + "epoch": 3.45, + "step": 161999 + }, + { + "epoch": 3.45, + "learning_rate": 6.899387755102041e-05, + "loss": 2.0489, + "step": 162000 + }, + { + "FLOPS loss": 0.08299516886472702, + "L0_d": 726.27, + "MLM loss": 1.958557367324829, + "epoch": 3.46, + "step": 162499 + }, + { + "epoch": 3.46, + "learning_rate": 6.889183673469388e-05, + "loss": 2.0539, + "step": 162500 + }, + { + "FLOPS loss": 0.08221839368343353, + "L0_d": 634.72, + "MLM loss": 1.9704405069351196, + "epoch": 3.47, + "step": 162999 + }, + { + "epoch": 3.47, + "learning_rate": 6.878979591836735e-05, + "loss": 2.0522, + "step": 163000 + }, + { + "FLOPS loss": 0.0814078077673912, + "L0_d": 800.78, + "MLM loss": 1.8931515216827393, + "epoch": 3.48, + "step": 163499 + }, + { + "epoch": 3.48, + "learning_rate": 6.868795918367348e-05, + "loss": 2.0502, + "step": 163500 + }, + { + "FLOPS loss": 0.08102941513061523, + "L0_d": 779.53, + "MLM loss": 1.9019794464111328, + "epoch": 3.49, + "step": 163999 + }, + { + "epoch": 3.49, + "learning_rate": 6.858591836734694e-05, + "loss": 2.0502, + "step": 164000 + }, + { + "FLOPS loss": 0.07698705792427063, + "L0_d": 880.31, + "MLM loss": 2.169793128967285, + "epoch": 3.5, + "step": 164499 + }, + { + "epoch": 3.5, + "learning_rate": 6.848387755102042e-05, + "loss": 2.0495, + "step": 164500 + }, + { + "FLOPS loss": 0.07340797036886215, + "L0_d": 836.19, + "MLM loss": 2.1057276725769043, + "epoch": 3.51, + "step": 164999 + }, + { + "epoch": 3.51, + "learning_rate": 6.838183673469388e-05, + "loss": 2.0522, + "step": 165000 + }, + { + "FLOPS loss": 0.06688041239976883, + "L0_d": 516.97, + "MLM loss": 1.8922147750854492, + "epoch": 3.52, + "step": 165499 + }, + { + "epoch": 3.52, + "learning_rate": 6.827979591836735e-05, + "loss": 2.0489, + "step": 165500 + }, + { + "FLOPS loss": 0.07872175425291061, + "L0_d": 844.69, + "MLM loss": 2.0002338886260986, + "epoch": 3.54, + "step": 165999 + }, + { + "epoch": 3.54, + "learning_rate": 6.817795918367347e-05, + "loss": 2.0477, + "step": 166000 + }, + { + "FLOPS loss": 0.06916598230600357, + "L0_d": 696.31, + "MLM loss": 1.8614131212234497, + "epoch": 3.55, + "step": 166499 + }, + { + "epoch": 3.55, + "learning_rate": 6.807591836734695e-05, + "loss": 2.0469, + "step": 166500 + }, + { + "FLOPS loss": 0.08415335416793823, + "L0_d": 721.41, + "MLM loss": 2.031548500061035, + "epoch": 3.56, + "step": 166999 + }, + { + "epoch": 3.56, + "learning_rate": 6.797387755102041e-05, + "loss": 2.0474, + "step": 167000 + }, + { + "FLOPS loss": 0.06985273212194443, + "L0_d": 651.58, + "MLM loss": 1.9898622035980225, + "epoch": 3.57, + "step": 167499 + }, + { + "epoch": 3.57, + "learning_rate": 6.787183673469389e-05, + "loss": 2.0465, + "step": 167500 + }, + { + "FLOPS loss": 0.07171157002449036, + "L0_d": 630.3, + "MLM loss": 2.079214096069336, + "epoch": 3.58, + "step": 167999 + }, + { + "epoch": 3.58, + "learning_rate": 6.776979591836735e-05, + "loss": 2.0488, + "step": 168000 + }, + { + "FLOPS loss": 0.10304520279169083, + "L0_d": 889.3, + "MLM loss": 1.9510843753814697, + "epoch": 3.59, + "step": 168499 + }, + { + "epoch": 3.59, + "learning_rate": 6.766775510204082e-05, + "loss": 2.0445, + "step": 168500 + }, + { + "FLOPS loss": 0.07319284975528717, + "L0_d": 823.53, + "MLM loss": 1.9499073028564453, + "epoch": 3.6, + "step": 168999 + }, + { + "epoch": 3.6, + "learning_rate": 6.756591836734694e-05, + "loss": 2.0458, + "step": 169000 + }, + { + "FLOPS loss": 0.07182403653860092, + "L0_d": 741.36, + "MLM loss": 1.8717212677001953, + "epoch": 3.61, + "step": 169499 + }, + { + "epoch": 3.61, + "learning_rate": 6.746387755102042e-05, + "loss": 2.0428, + "step": 169500 + }, + { + "FLOPS loss": 0.08154419809579849, + "L0_d": 823.67, + "MLM loss": 1.9579507112503052, + "epoch": 3.62, + "step": 169999 + }, + { + "epoch": 3.62, + "learning_rate": 6.736183673469388e-05, + "loss": 2.0461, + "step": 170000 + }, + { + "FLOPS loss": 0.07501042634248734, + "L0_d": 733.5, + "MLM loss": 1.9381325244903564, + "epoch": 3.63, + "step": 170499 + }, + { + "epoch": 3.63, + "learning_rate": 6.725979591836735e-05, + "loss": 2.0417, + "step": 170500 + }, + { + "FLOPS loss": 0.07502731680870056, + "L0_d": 756.14, + "MLM loss": 2.078585624694824, + "epoch": 3.64, + "step": 170999 + }, + { + "epoch": 3.64, + "learning_rate": 6.715775510204083e-05, + "loss": 2.0407, + "step": 171000 + }, + { + "FLOPS loss": 0.07120244204998016, + "L0_d": 591.41, + "MLM loss": 2.0549476146698, + "epoch": 3.65, + "step": 171499 + }, + { + "epoch": 3.65, + "learning_rate": 6.705571428571429e-05, + "loss": 2.0434, + "step": 171500 + }, + { + "FLOPS loss": 0.08011957257986069, + "L0_d": 699.62, + "MLM loss": 1.9056050777435303, + "epoch": 3.66, + "step": 171999 + }, + { + "epoch": 3.66, + "learning_rate": 6.695367346938775e-05, + "loss": 2.0408, + "step": 172000 + }, + { + "FLOPS loss": 0.09575420618057251, + "L0_d": 1030.78, + "MLM loss": 1.995476484298706, + "epoch": 3.67, + "step": 172499 + }, + { + "epoch": 3.67, + "learning_rate": 6.685183673469389e-05, + "loss": 2.043, + "step": 172500 + }, + { + "FLOPS loss": 0.08612006902694702, + "L0_d": 676.3, + "MLM loss": 2.0726420879364014, + "epoch": 3.68, + "step": 172999 + }, + { + "epoch": 3.68, + "learning_rate": 6.674979591836735e-05, + "loss": 2.0417, + "step": 173000 + }, + { + "FLOPS loss": 0.06685136258602142, + "L0_d": 1046.97, + "MLM loss": 1.9004302024841309, + "epoch": 3.69, + "step": 173499 + }, + { + "epoch": 3.69, + "learning_rate": 6.664775510204082e-05, + "loss": 2.0411, + "step": 173500 + }, + { + "FLOPS loss": 0.0729975476861, + "L0_d": 785.67, + "MLM loss": 2.033959150314331, + "epoch": 3.71, + "step": 173999 + }, + { + "epoch": 3.71, + "learning_rate": 6.654571428571428e-05, + "loss": 2.0404, + "step": 174000 + }, + { + "FLOPS loss": 0.07711607962846756, + "L0_d": 713.72, + "MLM loss": 1.8837366104125977, + "epoch": 3.72, + "step": 174499 + }, + { + "epoch": 3.72, + "learning_rate": 6.644367346938776e-05, + "loss": 2.0394, + "step": 174500 + }, + { + "FLOPS loss": 0.07960330694913864, + "L0_d": 759.89, + "MLM loss": 1.871514081954956, + "epoch": 3.73, + "step": 174999 + }, + { + "epoch": 3.73, + "learning_rate": 6.634183673469388e-05, + "loss": 2.0399, + "step": 175000 + }, + { + "FLOPS loss": 0.08591853827238083, + "L0_d": 1124.56, + "MLM loss": 1.882832646369934, + "epoch": 3.74, + "step": 175499 + }, + { + "epoch": 3.74, + "learning_rate": 6.623979591836735e-05, + "loss": 2.0366, + "step": 175500 + }, + { + "FLOPS loss": 0.06938300281763077, + "L0_d": 552.39, + "MLM loss": 2.0231130123138428, + "epoch": 3.75, + "step": 175999 + }, + { + "epoch": 3.75, + "learning_rate": 6.613775510204083e-05, + "loss": 2.0372, + "step": 176000 + }, + { + "FLOPS loss": 0.06626979261636734, + "L0_d": 547.67, + "MLM loss": 1.9859817028045654, + "epoch": 3.76, + "step": 176499 + }, + { + "epoch": 3.76, + "learning_rate": 6.603571428571429e-05, + "loss": 2.0393, + "step": 176500 + }, + { + "FLOPS loss": 0.08292733132839203, + "L0_d": 876.09, + "MLM loss": 1.8628358840942383, + "epoch": 3.77, + "step": 176999 + }, + { + "epoch": 3.77, + "learning_rate": 6.593367346938775e-05, + "loss": 2.0362, + "step": 177000 + }, + { + "FLOPS loss": 0.0899147242307663, + "L0_d": 759.33, + "MLM loss": 1.9406108856201172, + "epoch": 3.78, + "step": 177499 + }, + { + "epoch": 3.78, + "learning_rate": 6.583183673469389e-05, + "loss": 2.0333, + "step": 177500 + }, + { + "FLOPS loss": 0.07696028053760529, + "L0_d": 644.02, + "MLM loss": 2.1251962184906006, + "epoch": 3.79, + "step": 177999 + }, + { + "epoch": 3.79, + "learning_rate": 6.572979591836735e-05, + "loss": 2.0329, + "step": 178000 + }, + { + "FLOPS loss": 0.07335279136896133, + "L0_d": 671.97, + "MLM loss": 1.8094111680984497, + "epoch": 3.8, + "step": 178499 + }, + { + "epoch": 3.8, + "learning_rate": 6.562775510204082e-05, + "loss": 2.0354, + "step": 178500 + }, + { + "FLOPS loss": 0.05993517488241196, + "L0_d": 629.39, + "MLM loss": 1.9419050216674805, + "epoch": 3.81, + "step": 178999 + }, + { + "epoch": 3.81, + "learning_rate": 6.55257142857143e-05, + "loss": 2.0345, + "step": 179000 + }, + { + "FLOPS loss": 0.08473289757966995, + "L0_d": 770.55, + "MLM loss": 1.9759211540222168, + "epoch": 3.82, + "step": 179499 + }, + { + "epoch": 3.82, + "learning_rate": 6.54238775510204e-05, + "loss": 2.0384, + "step": 179500 + }, + { + "FLOPS loss": 0.07711312174797058, + "L0_d": 760.28, + "MLM loss": 1.8868836164474487, + "epoch": 3.83, + "step": 179999 + }, + { + "epoch": 3.83, + "learning_rate": 6.532183673469388e-05, + "loss": 2.0349, + "step": 180000 + }, + { + "FLOPS loss": 0.0859721377491951, + "L0_d": 608.56, + "MLM loss": 1.9114336967468262, + "epoch": 3.84, + "step": 180499 + }, + { + "epoch": 3.84, + "learning_rate": 6.521979591836736e-05, + "loss": 2.0295, + "step": 180500 + }, + { + "FLOPS loss": 0.08973046392202377, + "L0_d": 883.16, + "MLM loss": 1.7612589597702026, + "epoch": 3.85, + "step": 180999 + }, + { + "epoch": 3.85, + "learning_rate": 6.511775510204082e-05, + "loss": 2.0286, + "step": 181000 + }, + { + "FLOPS loss": 0.06494830548763275, + "L0_d": 711.47, + "MLM loss": 1.9608688354492188, + "epoch": 3.87, + "step": 181499 + }, + { + "epoch": 3.87, + "learning_rate": 6.501571428571429e-05, + "loss": 2.0352, + "step": 181500 + }, + { + "FLOPS loss": 0.08908434212207794, + "L0_d": 922.59, + "MLM loss": 1.9575316905975342, + "epoch": 3.88, + "step": 181999 + }, + { + "epoch": 3.88, + "learning_rate": 6.491387755102041e-05, + "loss": 2.0276, + "step": 182000 + }, + { + "FLOPS loss": 0.07852374762296677, + "L0_d": 824.45, + "MLM loss": 2.036099433898926, + "epoch": 3.89, + "step": 182499 + }, + { + "epoch": 3.89, + "learning_rate": 6.481183673469387e-05, + "loss": 2.0341, + "step": 182500 + }, + { + "FLOPS loss": 0.06813400238752365, + "L0_d": 739.58, + "MLM loss": 1.91340970993042, + "epoch": 3.9, + "step": 182999 + }, + { + "epoch": 3.9, + "learning_rate": 6.470979591836735e-05, + "loss": 2.0341, + "step": 183000 + }, + { + "FLOPS loss": 0.08716423064470291, + "L0_d": 1196.8, + "MLM loss": 1.9538612365722656, + "epoch": 3.91, + "step": 183499 + }, + { + "epoch": 3.91, + "learning_rate": 6.460775510204082e-05, + "loss": 2.0333, + "step": 183500 + }, + { + "FLOPS loss": 0.07709267735481262, + "L0_d": 814.75, + "MLM loss": 1.9605598449707031, + "epoch": 3.92, + "step": 183999 + }, + { + "epoch": 3.92, + "learning_rate": 6.45057142857143e-05, + "loss": 2.0297, + "step": 184000 + }, + { + "FLOPS loss": 0.08532577753067017, + "L0_d": 1042.2, + "MLM loss": 1.9132826328277588, + "epoch": 3.93, + "step": 184499 + }, + { + "epoch": 3.93, + "learning_rate": 6.440367346938776e-05, + "loss": 2.028, + "step": 184500 + }, + { + "FLOPS loss": 0.08234952390193939, + "L0_d": 884.34, + "MLM loss": 1.7370164394378662, + "epoch": 3.94, + "step": 184999 + }, + { + "epoch": 3.94, + "learning_rate": 6.430183673469388e-05, + "loss": 2.0298, + "step": 185000 + }, + { + "FLOPS loss": 0.058814145624637604, + "L0_d": 477.72, + "MLM loss": 1.8557758331298828, + "epoch": 3.95, + "step": 185499 + }, + { + "epoch": 3.95, + "learning_rate": 6.419979591836735e-05, + "loss": 2.0295, + "step": 185500 + }, + { + "FLOPS loss": 0.06794530153274536, + "L0_d": 697.06, + "MLM loss": 1.875330924987793, + "epoch": 3.96, + "step": 185999 + }, + { + "epoch": 3.96, + "learning_rate": 6.409775510204082e-05, + "loss": 2.0248, + "step": 186000 + }, + { + "FLOPS loss": 0.07963329553604126, + "L0_d": 585.33, + "MLM loss": 1.9809973239898682, + "epoch": 3.97, + "step": 186499 + }, + { + "epoch": 3.97, + "learning_rate": 6.399571428571429e-05, + "loss": 2.0245, + "step": 186500 + }, + { + "FLOPS loss": 0.08781524002552032, + "L0_d": 1159.09, + "MLM loss": 2.064220905303955, + "epoch": 3.98, + "step": 186999 + }, + { + "epoch": 3.98, + "learning_rate": 6.389387755102041e-05, + "loss": 2.024, + "step": 187000 + }, + { + "FLOPS loss": 0.07237225025892258, + "L0_d": 772.23, + "MLM loss": 2.056314468383789, + "epoch": 3.99, + "step": 187499 + }, + { + "epoch": 3.99, + "learning_rate": 6.379183673469387e-05, + "loss": 2.0272, + "step": 187500 + }, + { + "FLOPS loss": 0.07651927322149277, + "L0_d": 869.56, + "MLM loss": 1.8988327980041504, + "epoch": 4.0, + "step": 187999 + }, + { + "epoch": 4.0, + "learning_rate": 6.369000000000001e-05, + "loss": 2.0273, + "step": 188000 + }, + { + "FLOPS loss": 0.06517531722784042, + "L0_d": 584.02, + "MLM loss": 2.0192251205444336, + "epoch": 4.01, + "step": 188499 + }, + { + "epoch": 4.01, + "learning_rate": 6.358795918367347e-05, + "loss": 2.0263, + "step": 188500 + }, + { + "FLOPS loss": 0.0843081995844841, + "L0_d": 947.58, + "MLM loss": 1.9554133415222168, + "epoch": 4.02, + "step": 188999 + }, + { + "epoch": 4.02, + "learning_rate": 6.348591836734694e-05, + "loss": 2.0245, + "step": 189000 + }, + { + "FLOPS loss": 0.0910363495349884, + "L0_d": 998.47, + "MLM loss": 1.9304252862930298, + "epoch": 4.04, + "step": 189499 + }, + { + "epoch": 4.04, + "learning_rate": 6.338387755102042e-05, + "loss": 2.0206, + "step": 189500 + }, + { + "FLOPS loss": 0.08949656784534454, + "L0_d": 986.08, + "MLM loss": 1.988331913948059, + "epoch": 4.05, + "step": 189999 + }, + { + "epoch": 4.05, + "learning_rate": 6.328183673469388e-05, + "loss": 2.0216, + "step": 190000 + }, + { + "FLOPS loss": 0.07536551356315613, + "L0_d": 1129.55, + "MLM loss": 1.943612813949585, + "epoch": 4.06, + "step": 190499 + }, + { + "epoch": 4.06, + "learning_rate": 6.317979591836735e-05, + "loss": 2.0232, + "step": 190500 + }, + { + "FLOPS loss": 0.10584509372711182, + "L0_d": 1000.84, + "MLM loss": 1.8180073499679565, + "epoch": 4.07, + "step": 190999 + }, + { + "epoch": 4.07, + "learning_rate": 6.307775510204081e-05, + "loss": 2.0192, + "step": 191000 + }, + { + "FLOPS loss": 0.09958112239837646, + "L0_d": 976.05, + "MLM loss": 1.8265557289123535, + "epoch": 4.08, + "step": 191499 + }, + { + "epoch": 4.08, + "learning_rate": 6.297571428571429e-05, + "loss": 2.0236, + "step": 191500 + }, + { + "FLOPS loss": 0.06521303206682205, + "L0_d": 791.47, + "MLM loss": 2.189824104309082, + "epoch": 4.09, + "step": 191999 + }, + { + "epoch": 4.09, + "learning_rate": 6.287367346938777e-05, + "loss": 2.0216, + "step": 192000 + }, + { + "FLOPS loss": 0.07057648152112961, + "L0_d": 820.94, + "MLM loss": 1.8760672807693481, + "epoch": 4.1, + "step": 192499 + }, + { + "epoch": 4.1, + "learning_rate": 6.277163265306123e-05, + "loss": 2.0241, + "step": 192500 + }, + { + "FLOPS loss": 0.09540248662233353, + "L0_d": 1057.05, + "MLM loss": 1.9012943506240845, + "epoch": 4.11, + "step": 192999 + }, + { + "epoch": 4.11, + "learning_rate": 6.26695918367347e-05, + "loss": 2.018, + "step": 193000 + }, + { + "FLOPS loss": 0.05730264261364937, + "L0_d": 657.75, + "MLM loss": 1.9492607116699219, + "epoch": 4.12, + "step": 193499 + }, + { + "epoch": 4.12, + "learning_rate": 6.256775510204082e-05, + "loss": 2.0174, + "step": 193500 + }, + { + "FLOPS loss": 0.0790947675704956, + "L0_d": 768.97, + "MLM loss": 1.8801112174987793, + "epoch": 4.13, + "step": 193999 + }, + { + "epoch": 4.13, + "learning_rate": 6.24657142857143e-05, + "loss": 2.0205, + "step": 194000 + }, + { + "FLOPS loss": 0.05512884259223938, + "L0_d": 577.09, + "MLM loss": 1.9257984161376953, + "epoch": 4.14, + "step": 194499 + }, + { + "epoch": 4.14, + "learning_rate": 6.236367346938776e-05, + "loss": 2.0185, + "step": 194500 + }, + { + "FLOPS loss": 0.08608260005712509, + "L0_d": 944.81, + "MLM loss": 2.005255699157715, + "epoch": 4.15, + "step": 194999 + }, + { + "epoch": 4.15, + "learning_rate": 6.226163265306122e-05, + "loss": 2.0164, + "step": 195000 + }, + { + "FLOPS loss": 0.07035192102193832, + "L0_d": 993.94, + "MLM loss": 1.9241690635681152, + "epoch": 4.16, + "step": 195499 + }, + { + "epoch": 4.16, + "learning_rate": 6.215979591836735e-05, + "loss": 2.019, + "step": 195500 + }, + { + "FLOPS loss": 0.06550294160842896, + "L0_d": 719.12, + "MLM loss": 1.9413261413574219, + "epoch": 4.17, + "step": 195999 + }, + { + "epoch": 4.17, + "learning_rate": 6.205775510204081e-05, + "loss": 2.0138, + "step": 196000 + }, + { + "FLOPS loss": 0.06137542426586151, + "L0_d": 583.73, + "MLM loss": 1.9586896896362305, + "epoch": 4.18, + "step": 196499 + }, + { + "epoch": 4.18, + "learning_rate": 6.195571428571429e-05, + "loss": 2.0185, + "step": 196500 + }, + { + "FLOPS loss": 0.06665011495351791, + "L0_d": 587.61, + "MLM loss": 1.8961763381958008, + "epoch": 4.2, + "step": 196999 + }, + { + "epoch": 4.2, + "learning_rate": 6.185367346938777e-05, + "loss": 2.0168, + "step": 197000 + }, + { + "FLOPS loss": 0.07283007353544235, + "L0_d": 620.45, + "MLM loss": 1.9708335399627686, + "epoch": 4.21, + "step": 197499 + }, + { + "epoch": 4.21, + "learning_rate": 6.175163265306123e-05, + "loss": 2.0135, + "step": 197500 + }, + { + "FLOPS loss": 0.07253549993038177, + "L0_d": 927.98, + "MLM loss": 1.9713668823242188, + "epoch": 4.22, + "step": 197999 + }, + { + "epoch": 4.22, + "learning_rate": 6.164979591836735e-05, + "loss": 2.0137, + "step": 198000 + }, + { + "FLOPS loss": 0.06769657135009766, + "L0_d": 544.48, + "MLM loss": 1.9186091423034668, + "epoch": 4.23, + "step": 198499 + }, + { + "epoch": 4.23, + "learning_rate": 6.154775510204082e-05, + "loss": 2.0174, + "step": 198500 + }, + { + "FLOPS loss": 0.08714200556278229, + "L0_d": 905.88, + "MLM loss": 1.8871827125549316, + "epoch": 4.24, + "step": 198999 + }, + { + "epoch": 4.24, + "learning_rate": 6.144571428571428e-05, + "loss": 2.015, + "step": 199000 + }, + { + "FLOPS loss": 0.07281828671693802, + "L0_d": 549.28, + "MLM loss": 1.9224956035614014, + "epoch": 4.25, + "step": 199499 + }, + { + "epoch": 4.25, + "learning_rate": 6.134367346938776e-05, + "loss": 2.0136, + "step": 199500 + }, + { + "FLOPS loss": 0.06316013634204865, + "L0_d": 645.33, + "MLM loss": 1.990523338317871, + "epoch": 4.26, + "step": 199999 + }, + { + "epoch": 4.26, + "learning_rate": 6.124183673469388e-05, + "loss": 2.018, + "step": 200000 + }, + { + "FLOPS loss": 0.09500326961278915, + "L0_d": 838.77, + "MLM loss": 1.8790405988693237, + "epoch": 4.27, + "step": 200499 + }, + { + "epoch": 4.27, + "learning_rate": 6.113979591836734e-05, + "loss": 2.0115, + "step": 200500 + }, + { + "FLOPS loss": 0.07647743076086044, + "L0_d": 746.02, + "MLM loss": 2.0634748935699463, + "epoch": 4.28, + "step": 200999 + }, + { + "epoch": 4.28, + "learning_rate": 6.103775510204082e-05, + "loss": 2.0179, + "step": 201000 + }, + { + "FLOPS loss": 0.06590724736452103, + "L0_d": 750.33, + "MLM loss": 2.047532081604004, + "epoch": 4.29, + "step": 201499 + }, + { + "epoch": 4.29, + "learning_rate": 6.093571428571429e-05, + "loss": 2.0102, + "step": 201500 + }, + { + "FLOPS loss": 0.0764516219496727, + "L0_d": 672.33, + "MLM loss": 1.8584916591644287, + "epoch": 4.3, + "step": 201999 + }, + { + "epoch": 4.3, + "learning_rate": 6.083367346938775e-05, + "loss": 2.0127, + "step": 202000 + }, + { + "FLOPS loss": 0.09220387041568756, + "L0_d": 1029.03, + "MLM loss": 1.7153816223144531, + "epoch": 4.31, + "step": 202499 + }, + { + "epoch": 4.31, + "learning_rate": 6.073183673469388e-05, + "loss": 2.0114, + "step": 202500 + }, + { + "FLOPS loss": 0.07743317633867264, + "L0_d": 651.98, + "MLM loss": 2.0165162086486816, + "epoch": 4.32, + "step": 202999 + }, + { + "epoch": 4.32, + "learning_rate": 6.062979591836735e-05, + "loss": 2.0095, + "step": 203000 + }, + { + "FLOPS loss": 0.05509522557258606, + "L0_d": 613.44, + "MLM loss": 2.139920949935913, + "epoch": 4.33, + "step": 203499 + }, + { + "epoch": 4.33, + "learning_rate": 6.0527755102040816e-05, + "loss": 2.011, + "step": 203500 + }, + { + "FLOPS loss": 0.07268443703651428, + "L0_d": 747.22, + "MLM loss": 1.8006410598754883, + "epoch": 4.34, + "step": 203999 + }, + { + "epoch": 4.34, + "learning_rate": 6.042571428571429e-05, + "loss": 2.0097, + "step": 204000 + }, + { + "FLOPS loss": 0.08287376165390015, + "L0_d": 773.5, + "MLM loss": 1.8289992809295654, + "epoch": 4.35, + "step": 204499 + }, + { + "epoch": 4.35, + "learning_rate": 6.0323877551020415e-05, + "loss": 2.0089, + "step": 204500 + }, + { + "FLOPS loss": 0.0774405300617218, + "L0_d": 866.12, + "MLM loss": 1.817805528640747, + "epoch": 4.37, + "step": 204999 + }, + { + "epoch": 4.37, + "learning_rate": 6.022183673469388e-05, + "loss": 2.0131, + "step": 205000 + }, + { + "FLOPS loss": 0.07295011729001999, + "L0_d": 856.09, + "MLM loss": 1.9340803623199463, + "epoch": 4.38, + "step": 205499 + }, + { + "epoch": 4.38, + "learning_rate": 6.011979591836735e-05, + "loss": 2.01, + "step": 205500 + }, + { + "FLOPS loss": 0.06522063910961151, + "L0_d": 941.17, + "MLM loss": 1.7702338695526123, + "epoch": 4.39, + "step": 205999 + }, + { + "epoch": 4.39, + "learning_rate": 6.0017755102040816e-05, + "loss": 2.0094, + "step": 206000 + }, + { + "FLOPS loss": 0.0726202130317688, + "L0_d": 824.72, + "MLM loss": 1.8889031410217285, + "epoch": 4.4, + "step": 206499 + }, + { + "epoch": 4.4, + "learning_rate": 5.991571428571429e-05, + "loss": 2.0043, + "step": 206500 + }, + { + "FLOPS loss": 0.08556578308343887, + "L0_d": 686.48, + "MLM loss": 1.9422351121902466, + "epoch": 4.41, + "step": 206999 + }, + { + "epoch": 4.41, + "learning_rate": 5.9813877551020415e-05, + "loss": 2.0111, + "step": 207000 + }, + { + "FLOPS loss": 0.0572795532643795, + "L0_d": 781.86, + "MLM loss": 2.0047783851623535, + "epoch": 4.42, + "step": 207499 + }, + { + "epoch": 4.42, + "learning_rate": 5.971183673469388e-05, + "loss": 2.0075, + "step": 207500 + }, + { + "FLOPS loss": 0.07259980589151382, + "L0_d": 992.34, + "MLM loss": 1.8562628030776978, + "epoch": 4.43, + "step": 207999 + }, + { + "epoch": 4.43, + "learning_rate": 5.960979591836735e-05, + "loss": 2.0033, + "step": 208000 + }, + { + "FLOPS loss": 0.08666159957647324, + "L0_d": 759.8, + "MLM loss": 1.9893381595611572, + "epoch": 4.44, + "step": 208499 + }, + { + "epoch": 4.44, + "learning_rate": 5.9507755102040816e-05, + "loss": 2.0072, + "step": 208500 + }, + { + "FLOPS loss": 0.07965318113565445, + "L0_d": 915.02, + "MLM loss": 2.064954996109009, + "epoch": 4.45, + "step": 208999 + }, + { + "epoch": 4.45, + "learning_rate": 5.940591836734695e-05, + "loss": 2.0075, + "step": 209000 + }, + { + "FLOPS loss": 0.06595531851053238, + "L0_d": 577.44, + "MLM loss": 2.1270761489868164, + "epoch": 4.46, + "step": 209499 + }, + { + "epoch": 4.46, + "learning_rate": 5.9303877551020415e-05, + "loss": 1.9995, + "step": 209500 + }, + { + "FLOPS loss": 0.08501884341239929, + "L0_d": 678.55, + "MLM loss": 1.8834452629089355, + "epoch": 4.47, + "step": 209999 + }, + { + "epoch": 4.47, + "learning_rate": 5.9201836734693886e-05, + "loss": 2.0079, + "step": 210000 + }, + { + "FLOPS loss": 0.07586806267499924, + "L0_d": 865.56, + "MLM loss": 1.942221760749817, + "epoch": 4.48, + "step": 210499 + }, + { + "epoch": 4.48, + "learning_rate": 5.909979591836735e-05, + "loss": 2.0091, + "step": 210500 + }, + { + "FLOPS loss": 0.07614757120609283, + "L0_d": 753.77, + "MLM loss": 1.9882011413574219, + "epoch": 4.49, + "step": 210999 + }, + { + "epoch": 4.49, + "learning_rate": 5.8997959183673465e-05, + "loss": 2.0061, + "step": 211000 + }, + { + "FLOPS loss": 0.07684236764907837, + "L0_d": 642.22, + "MLM loss": 1.8372766971588135, + "epoch": 4.5, + "step": 211499 + }, + { + "epoch": 4.5, + "learning_rate": 5.8895918367346936e-05, + "loss": 2.0079, + "step": 211500 + }, + { + "FLOPS loss": 0.08997940272092819, + "L0_d": 783.89, + "MLM loss": 1.818095088005066, + "epoch": 4.51, + "step": 211999 + }, + { + "epoch": 4.51, + "learning_rate": 5.8793877551020414e-05, + "loss": 2.0061, + "step": 212000 + }, + { + "FLOPS loss": 0.08491813391447067, + "L0_d": 1031.25, + "MLM loss": 1.9548670053482056, + "epoch": 4.53, + "step": 212499 + }, + { + "epoch": 4.53, + "learning_rate": 5.8691836734693886e-05, + "loss": 2.0034, + "step": 212500 + }, + { + "FLOPS loss": 0.09706159681081772, + "L0_d": 786.84, + "MLM loss": 1.8384318351745605, + "epoch": 4.54, + "step": 212999 + }, + { + "epoch": 4.54, + "learning_rate": 5.859e-05, + "loss": 1.9993, + "step": 213000 + }, + { + "FLOPS loss": 0.06909748166799545, + "L0_d": 762.97, + "MLM loss": 1.8404794931411743, + "epoch": 4.55, + "step": 213499 + }, + { + "epoch": 4.55, + "learning_rate": 5.848795918367347e-05, + "loss": 2.0042, + "step": 213500 + }, + { + "FLOPS loss": 0.06857288628816605, + "L0_d": 525.5, + "MLM loss": 1.8500382900238037, + "epoch": 4.56, + "step": 213999 + }, + { + "epoch": 4.56, + "learning_rate": 5.8385918367346936e-05, + "loss": 2.0063, + "step": 214000 + }, + { + "FLOPS loss": 0.07077398151159286, + "L0_d": 715.25, + "MLM loss": 1.8898844718933105, + "epoch": 4.57, + "step": 214499 + }, + { + "epoch": 4.57, + "learning_rate": 5.8283877551020414e-05, + "loss": 2.0038, + "step": 214500 + }, + { + "FLOPS loss": 0.08178684860467911, + "L0_d": 642.12, + "MLM loss": 1.9428343772888184, + "epoch": 4.58, + "step": 214999 + }, + { + "epoch": 4.58, + "learning_rate": 5.8181836734693886e-05, + "loss": 2.0035, + "step": 215000 + }, + { + "FLOPS loss": 0.07514248043298721, + "L0_d": 566.45, + "MLM loss": 1.8526628017425537, + "epoch": 4.59, + "step": 215499 + }, + { + "epoch": 4.59, + "learning_rate": 5.808e-05, + "loss": 2.0024, + "step": 215500 + }, + { + "FLOPS loss": 0.06888828426599503, + "L0_d": 753.84, + "MLM loss": 1.8139371871948242, + "epoch": 4.6, + "step": 215999 + }, + { + "epoch": 4.6, + "learning_rate": 5.797795918367347e-05, + "loss": 2.001, + "step": 216000 + }, + { + "FLOPS loss": 0.07311675697565079, + "L0_d": 1075.84, + "MLM loss": 1.8174786567687988, + "epoch": 4.61, + "step": 216499 + }, + { + "epoch": 4.61, + "learning_rate": 5.7875918367346936e-05, + "loss": 1.9985, + "step": 216500 + }, + { + "FLOPS loss": 0.0811404287815094, + "L0_d": 794.91, + "MLM loss": 1.907740831375122, + "epoch": 4.62, + "step": 216999 + }, + { + "epoch": 4.62, + "learning_rate": 5.777387755102041e-05, + "loss": 1.9996, + "step": 217000 + }, + { + "FLOPS loss": 0.07732479274272919, + "L0_d": 882.95, + "MLM loss": 1.8914384841918945, + "epoch": 4.63, + "step": 217499 + }, + { + "epoch": 4.63, + "learning_rate": 5.7671836734693885e-05, + "loss": 2.0016, + "step": 217500 + }, + { + "FLOPS loss": 0.08881332725286484, + "L0_d": 730.89, + "MLM loss": 1.7773964405059814, + "epoch": 4.64, + "step": 217999 + }, + { + "epoch": 4.64, + "learning_rate": 5.757e-05, + "loss": 2.0026, + "step": 218000 + }, + { + "FLOPS loss": 0.08179426938295364, + "L0_d": 1269.34, + "MLM loss": 1.8794455528259277, + "epoch": 4.65, + "step": 218499 + }, + { + "epoch": 4.65, + "learning_rate": 5.746795918367347e-05, + "loss": 2.0016, + "step": 218500 + }, + { + "FLOPS loss": 0.07119172066450119, + "L0_d": 803.27, + "MLM loss": 1.8622592687606812, + "epoch": 4.66, + "step": 218999 + }, + { + "epoch": 4.66, + "learning_rate": 5.7365918367346936e-05, + "loss": 1.9979, + "step": 219000 + }, + { + "FLOPS loss": 0.06549478322267532, + "L0_d": 638.23, + "MLM loss": 1.930425763130188, + "epoch": 4.67, + "step": 219499 + }, + { + "epoch": 4.67, + "learning_rate": 5.726387755102041e-05, + "loss": 1.9995, + "step": 219500 + }, + { + "FLOPS loss": 0.07244117558002472, + "L0_d": 1028.78, + "MLM loss": 1.8141852617263794, + "epoch": 4.69, + "step": 219999 + }, + { + "epoch": 4.69, + "learning_rate": 5.7161836734693885e-05, + "loss": 1.9981, + "step": 220000 + }, + { + "FLOPS loss": 0.07125560194253922, + "L0_d": 1015.98, + "MLM loss": 1.9103002548217773, + "epoch": 4.7, + "step": 220499 + }, + { + "epoch": 4.7, + "learning_rate": 5.706e-05, + "loss": 1.9935, + "step": 220500 + }, + { + "FLOPS loss": 0.10560493171215057, + "L0_d": 1126.92, + "MLM loss": 1.9834262132644653, + "epoch": 4.71, + "step": 220999 + }, + { + "epoch": 4.71, + "learning_rate": 5.695795918367347e-05, + "loss": 1.994, + "step": 221000 + }, + { + "FLOPS loss": 0.06856855005025864, + "L0_d": 628.53, + "MLM loss": 2.0253748893737793, + "epoch": 4.72, + "step": 221499 + }, + { + "epoch": 4.72, + "learning_rate": 5.6855918367346935e-05, + "loss": 1.9986, + "step": 221500 + }, + { + "FLOPS loss": 0.09187950193881989, + "L0_d": 1048.83, + "MLM loss": 1.8531248569488525, + "epoch": 4.73, + "step": 221999 + }, + { + "epoch": 4.73, + "learning_rate": 5.675387755102041e-05, + "loss": 1.9964, + "step": 222000 + }, + { + "FLOPS loss": 0.07964413613080978, + "L0_d": 870.2, + "MLM loss": 1.8161282539367676, + "epoch": 4.74, + "step": 222499 + }, + { + "epoch": 4.74, + "learning_rate": 5.665183673469388e-05, + "loss": 1.997, + "step": 222500 + }, + { + "FLOPS loss": 0.06449469178915024, + "L0_d": 703.88, + "MLM loss": 1.9795019626617432, + "epoch": 4.75, + "step": 222999 + }, + { + "epoch": 4.75, + "learning_rate": 5.6549795918367357e-05, + "loss": 1.9936, + "step": 223000 + }, + { + "FLOPS loss": 0.10031864047050476, + "L0_d": 774.0, + "MLM loss": 1.8794991970062256, + "epoch": 4.76, + "step": 223499 + }, + { + "epoch": 4.76, + "learning_rate": 5.644795918367347e-05, + "loss": 1.9922, + "step": 223500 + }, + { + "FLOPS loss": 0.07564357668161392, + "L0_d": 614.22, + "MLM loss": 2.004389524459839, + "epoch": 4.77, + "step": 223999 + }, + { + "epoch": 4.77, + "learning_rate": 5.634591836734694e-05, + "loss": 1.9937, + "step": 224000 + }, + { + "FLOPS loss": 0.07761155813932419, + "L0_d": 952.73, + "MLM loss": 1.999483346939087, + "epoch": 4.78, + "step": 224499 + }, + { + "epoch": 4.78, + "learning_rate": 5.624387755102041e-05, + "loss": 2.0023, + "step": 224500 + }, + { + "FLOPS loss": 0.09304012358188629, + "L0_d": 772.3, + "MLM loss": 1.9601857662200928, + "epoch": 4.79, + "step": 224999 + }, + { + "epoch": 4.79, + "learning_rate": 5.614183673469388e-05, + "loss": 1.9928, + "step": 225000 + }, + { + "FLOPS loss": 0.08472666889429092, + "L0_d": 944.25, + "MLM loss": 1.9953516721725464, + "epoch": 4.8, + "step": 225499 + }, + { + "epoch": 4.8, + "learning_rate": 5.6039795918367356e-05, + "loss": 1.9958, + "step": 225500 + }, + { + "FLOPS loss": 0.0648883655667305, + "L0_d": 591.64, + "MLM loss": 1.7785028219223022, + "epoch": 4.81, + "step": 225999 + }, + { + "epoch": 4.81, + "learning_rate": 5.593795918367347e-05, + "loss": 1.9923, + "step": 226000 + }, + { + "FLOPS loss": 0.06218310445547104, + "L0_d": 643.28, + "MLM loss": 1.9573731422424316, + "epoch": 4.82, + "step": 226499 + }, + { + "epoch": 4.82, + "learning_rate": 5.583591836734694e-05, + "loss": 1.9919, + "step": 226500 + }, + { + "FLOPS loss": 0.09281591325998306, + "L0_d": 1149.56, + "MLM loss": 1.9270288944244385, + "epoch": 4.83, + "step": 226999 + }, + { + "epoch": 4.83, + "learning_rate": 5.5733877551020406e-05, + "loss": 1.9941, + "step": 227000 + }, + { + "FLOPS loss": 0.07068472355604172, + "L0_d": 792.38, + "MLM loss": 1.9185221195220947, + "epoch": 4.84, + "step": 227499 + }, + { + "epoch": 4.84, + "learning_rate": 5.563183673469388e-05, + "loss": 1.9916, + "step": 227500 + }, + { + "FLOPS loss": 0.07062963396310806, + "L0_d": 712.59, + "MLM loss": 1.8447917699813843, + "epoch": 4.86, + "step": 227999 + }, + { + "epoch": 4.86, + "learning_rate": 5.552979591836734e-05, + "loss": 1.9943, + "step": 228000 + }, + { + "FLOPS loss": 0.080656997859478, + "L0_d": 760.47, + "MLM loss": 1.926360845565796, + "epoch": 4.87, + "step": 228499 + }, + { + "epoch": 4.87, + "learning_rate": 5.542795918367347e-05, + "loss": 1.9918, + "step": 228500 + }, + { + "FLOPS loss": 0.07454971224069595, + "L0_d": 764.95, + "MLM loss": 1.9017677307128906, + "epoch": 4.88, + "step": 228999 + }, + { + "epoch": 4.88, + "learning_rate": 5.532591836734694e-05, + "loss": 1.9924, + "step": 229000 + }, + { + "FLOPS loss": 0.07176411151885986, + "L0_d": 671.66, + "MLM loss": 2.02742075920105, + "epoch": 4.89, + "step": 229499 + }, + { + "epoch": 4.89, + "learning_rate": 5.5223877551020406e-05, + "loss": 1.9887, + "step": 229500 + }, + { + "FLOPS loss": 0.09042114019393921, + "L0_d": 706.11, + "MLM loss": 1.874399185180664, + "epoch": 4.9, + "step": 229999 + }, + { + "epoch": 4.9, + "learning_rate": 5.512183673469388e-05, + "loss": 1.9916, + "step": 230000 + }, + { + "FLOPS loss": 0.0008960551349446177, + "L0_d": 31998.22, + "MLM loss": 1.7690234184265137, + "epoch": 4.91, + "step": 230499 + }, + { + "epoch": 4.91, + "learning_rate": 5.5020000000000005e-05, + "loss": 1.8658, + "step": 230500 + }, + { + "FLOPS loss": 0.0015128779923543334, + "L0_d": 31847.52, + "MLM loss": 1.8071842193603516, + "epoch": 4.92, + "step": 230999 + }, + { + "epoch": 4.92, + "learning_rate": 5.491795918367347e-05, + "loss": 1.8681, + "step": 231000 + }, + { + "FLOPS loss": 0.002377150347456336, + "L0_d": 29792.11, + "MLM loss": 1.9174680709838867, + "epoch": 4.93, + "step": 231499 + }, + { + "epoch": 4.93, + "learning_rate": 5.481591836734694e-05, + "loss": 1.8658, + "step": 231500 + }, + { + "FLOPS loss": 0.002215802203863859, + "L0_d": 24194.72, + "MLM loss": 1.8142987489700317, + "epoch": 4.94, + "step": 231999 + }, + { + "epoch": 4.94, + "learning_rate": 5.4713877551020406e-05, + "loss": 1.8681, + "step": 232000 + }, + { + "FLOPS loss": 0.002735304646193981, + "L0_d": 20741.98, + "MLM loss": 1.9606852531433105, + "epoch": 4.95, + "step": 232499 + }, + { + "epoch": 4.95, + "learning_rate": 5.461204081632654e-05, + "loss": 1.8704, + "step": 232500 + }, + { + "FLOPS loss": 0.0025493777357041836, + "L0_d": 18399.17, + "MLM loss": 1.8065788745880127, + "epoch": 4.96, + "step": 232999 + }, + { + "epoch": 4.96, + "learning_rate": 5.4510000000000005e-05, + "loss": 1.8718, + "step": 233000 + }, + { + "FLOPS loss": 0.0038267311174422503, + "L0_d": 18764.34, + "MLM loss": 1.9012820720672607, + "epoch": 4.97, + "step": 233499 + }, + { + "epoch": 4.97, + "learning_rate": 5.4407959183673476e-05, + "loss": 1.8723, + "step": 233500 + }, + { + "FLOPS loss": 0.004111337009817362, + "L0_d": 17049.22, + "MLM loss": 1.8614271879196167, + "epoch": 4.98, + "step": 233999 + }, + { + "epoch": 4.98, + "learning_rate": 5.430591836734694e-05, + "loss": 1.8725, + "step": 234000 + }, + { + "FLOPS loss": 0.00438784621655941, + "L0_d": 15735.7, + "MLM loss": 1.8476033210754395, + "epoch": 4.99, + "step": 234499 + }, + { + "epoch": 4.99, + "learning_rate": 5.420387755102041e-05, + "loss": 1.8701, + "step": 234500 + }, + { + "FLOPS loss": 0.00378360110335052, + "L0_d": 12747.81, + "MLM loss": 1.8290207386016846, + "epoch": 5.0, + "step": 234999 + }, + { + "epoch": 5.0, + "learning_rate": 5.410183673469388e-05, + "loss": 1.8713, + "step": 235000 + }, + { + "FLOPS loss": 0.0036748433485627174, + "L0_d": 10985.72, + "MLM loss": 1.953842043876648, + "epoch": 5.02, + "step": 235499 + }, + { + "epoch": 5.02, + "learning_rate": 5.399979591836735e-05, + "loss": 1.8715, + "step": 235500 + }, + { + "FLOPS loss": 0.0055288574658334255, + "L0_d": 12111.08, + "MLM loss": 1.7434651851654053, + "epoch": 5.03, + "step": 235999 + }, + { + "epoch": 5.03, + "learning_rate": 5.3897755102040813e-05, + "loss": 1.8752, + "step": 236000 + }, + { + "FLOPS loss": 0.006115641910582781, + "L0_d": 11654.31, + "MLM loss": 1.8618048429489136, + "epoch": 5.04, + "step": 236499 + }, + { + "epoch": 5.04, + "learning_rate": 5.379591836734694e-05, + "loss": 1.8754, + "step": 236500 + }, + { + "FLOPS loss": 0.00563366012647748, + "L0_d": 10211.41, + "MLM loss": 1.9593673944473267, + "epoch": 5.05, + "step": 236999 + }, + { + "epoch": 5.05, + "learning_rate": 5.369387755102041e-05, + "loss": 1.8715, + "step": 237000 + }, + { + "FLOPS loss": 0.0055515356361866, + "L0_d": 9021.38, + "MLM loss": 1.8693525791168213, + "epoch": 5.06, + "step": 237499 + }, + { + "epoch": 5.06, + "learning_rate": 5.359183673469388e-05, + "loss": 1.8705, + "step": 237500 + }, + { + "FLOPS loss": 0.004694546107202768, + "L0_d": 7627.42, + "MLM loss": 1.9569108486175537, + "epoch": 5.07, + "step": 237999 + }, + { + "epoch": 5.07, + "learning_rate": 5.348979591836735e-05, + "loss": 1.8742, + "step": 238000 + }, + { + "FLOPS loss": 0.0069424486719071865, + "L0_d": 8976.55, + "MLM loss": 1.8137693405151367, + "epoch": 5.08, + "step": 238499 + }, + { + "epoch": 5.08, + "learning_rate": 5.3387959183673476e-05, + "loss": 1.872, + "step": 238500 + }, + { + "FLOPS loss": 0.00838744267821312, + "L0_d": 8812.91, + "MLM loss": 1.7488086223602295, + "epoch": 5.09, + "step": 238999 + }, + { + "epoch": 5.09, + "learning_rate": 5.328612244897959e-05, + "loss": 1.8758, + "step": 239000 + }, + { + "FLOPS loss": 0.00693407841026783, + "L0_d": 6661.42, + "MLM loss": 1.987975835800171, + "epoch": 5.1, + "step": 239499 + }, + { + "epoch": 5.1, + "learning_rate": 5.318408163265306e-05, + "loss": 1.8779, + "step": 239500 + }, + { + "FLOPS loss": 0.007485151290893555, + "L0_d": 6464.69, + "MLM loss": 1.8088548183441162, + "epoch": 5.11, + "step": 239999 + }, + { + "epoch": 5.11, + "learning_rate": 5.3082040816326526e-05, + "loss": 1.875, + "step": 240000 + }, + { + "FLOPS loss": 0.008187003433704376, + "L0_d": 5961.67, + "MLM loss": 1.8100885152816772, + "epoch": 5.12, + "step": 240499 + }, + { + "epoch": 5.12, + "learning_rate": 5.2980000000000004e-05, + "loss": 1.8735, + "step": 240500 + }, + { + "FLOPS loss": 0.008060183376073837, + "L0_d": 5845.19, + "MLM loss": 1.8168284893035889, + "epoch": 5.13, + "step": 240999 + }, + { + "epoch": 5.13, + "learning_rate": 5.2877959183673476e-05, + "loss": 1.8771, + "step": 241000 + }, + { + "FLOPS loss": 0.010463972575962543, + "L0_d": 6205.86, + "MLM loss": 1.7326014041900635, + "epoch": 5.14, + "step": 241499 + }, + { + "epoch": 5.14, + "learning_rate": 5.277591836734694e-05, + "loss": 1.8801, + "step": 241500 + }, + { + "FLOPS loss": 0.011188147589564323, + "L0_d": 6291.0, + "MLM loss": 1.9021883010864258, + "epoch": 5.15, + "step": 241999 + }, + { + "epoch": 5.15, + "learning_rate": 5.267408163265306e-05, + "loss": 1.8799, + "step": 242000 + }, + { + "FLOPS loss": 0.013200161047279835, + "L0_d": 7156.77, + "MLM loss": 1.6133452653884888, + "epoch": 5.16, + "step": 242499 + }, + { + "epoch": 5.16, + "learning_rate": 5.2572040816326526e-05, + "loss": 1.8787, + "step": 242500 + }, + { + "FLOPS loss": 0.013426079414784908, + "L0_d": 5465.06, + "MLM loss": 1.911632776260376, + "epoch": 5.17, + "step": 242999 + }, + { + "epoch": 5.17, + "learning_rate": 5.247000000000001e-05, + "loss": 1.879, + "step": 243000 + }, + { + "FLOPS loss": 0.010374533012509346, + "L0_d": 4300.98, + "MLM loss": 1.865431547164917, + "epoch": 5.19, + "step": 243499 + }, + { + "epoch": 5.19, + "learning_rate": 5.2367959183673476e-05, + "loss": 1.8795, + "step": 243500 + }, + { + "FLOPS loss": 0.014977757818996906, + "L0_d": 4963.09, + "MLM loss": 1.8555488586425781, + "epoch": 5.2, + "step": 243999 + }, + { + "epoch": 5.2, + "learning_rate": 5.226591836734695e-05, + "loss": 1.8808, + "step": 244000 + }, + { + "FLOPS loss": 0.010343066416680813, + "L0_d": 3994.11, + "MLM loss": 1.9097082614898682, + "epoch": 5.21, + "step": 244499 + }, + { + "epoch": 5.21, + "learning_rate": 5.216387755102041e-05, + "loss": 1.8859, + "step": 244500 + }, + { + "FLOPS loss": 0.014617117121815681, + "L0_d": 4153.5, + "MLM loss": 1.8086098432540894, + "epoch": 5.22, + "step": 244999 + }, + { + "epoch": 5.22, + "learning_rate": 5.206204081632653e-05, + "loss": 1.8823, + "step": 245000 + }, + { + "FLOPS loss": 0.013393222354352474, + "L0_d": 3711.11, + "MLM loss": 1.910830020904541, + "epoch": 5.23, + "step": 245499 + }, + { + "epoch": 5.23, + "learning_rate": 5.196e-05, + "loss": 1.8811, + "step": 245500 + }, + { + "FLOPS loss": 0.014843770302832127, + "L0_d": 3604.33, + "MLM loss": 1.838127851486206, + "epoch": 5.24, + "step": 245999 + }, + { + "epoch": 5.24, + "learning_rate": 5.1857959183673475e-05, + "loss": 1.8852, + "step": 246000 + }, + { + "FLOPS loss": 0.019269835203886032, + "L0_d": 4254.81, + "MLM loss": 1.8526206016540527, + "epoch": 5.25, + "step": 246499 + }, + { + "epoch": 5.25, + "learning_rate": 5.175591836734695e-05, + "loss": 1.8886, + "step": 246500 + }, + { + "FLOPS loss": 0.01963115483522415, + "L0_d": 3850.11, + "MLM loss": 2.0099689960479736, + "epoch": 5.26, + "step": 246999 + }, + { + "epoch": 5.26, + "learning_rate": 5.165387755102041e-05, + "loss": 1.8854, + "step": 247000 + }, + { + "FLOPS loss": 0.016454359516501427, + "L0_d": 3115.81, + "MLM loss": 1.9626414775848389, + "epoch": 5.27, + "step": 247499 + }, + { + "epoch": 5.27, + "learning_rate": 5.155204081632653e-05, + "loss": 1.8857, + "step": 247500 + }, + { + "FLOPS loss": 0.020083287730813026, + "L0_d": 3520.73, + "MLM loss": 1.9162139892578125, + "epoch": 5.28, + "step": 247999 + }, + { + "epoch": 5.28, + "learning_rate": 5.145e-05, + "loss": 1.8854, + "step": 248000 + }, + { + "FLOPS loss": 0.0178839061409235, + "L0_d": 2936.16, + "MLM loss": 1.8663880825042725, + "epoch": 5.29, + "step": 248499 + }, + { + "epoch": 5.29, + "learning_rate": 5.134795918367347e-05, + "loss": 1.8929, + "step": 248500 + }, + { + "FLOPS loss": 0.023697547614574432, + "L0_d": 3550.75, + "MLM loss": 1.9941072463989258, + "epoch": 5.3, + "step": 248999 + }, + { + "epoch": 5.3, + "learning_rate": 5.124591836734695e-05, + "loss": 1.8885, + "step": 249000 + }, + { + "FLOPS loss": 0.020585505291819572, + "L0_d": 2454.28, + "MLM loss": 1.7720873355865479, + "epoch": 5.31, + "step": 249499 + }, + { + "epoch": 5.31, + "learning_rate": 5.114387755102041e-05, + "loss": 1.8903, + "step": 249500 + }, + { + "FLOPS loss": 0.022391222417354584, + "L0_d": 3282.28, + "MLM loss": 1.873108148574829, + "epoch": 5.32, + "step": 249999 + }, + { + "epoch": 5.32, + "learning_rate": 5.104204081632653e-05, + "loss": 1.8896, + "step": 250000 + }, + { + "FLOPS loss": 0.02332943119108677, + "L0_d": 2424.98, + "MLM loss": 2.0432984828948975, + "epoch": 5.33, + "step": 250499 + }, + { + "epoch": 5.33, + "learning_rate": 5.094e-05, + "loss": 1.8887, + "step": 250500 + }, + { + "FLOPS loss": 0.02283013053238392, + "L0_d": 2545.16, + "MLM loss": 1.946418046951294, + "epoch": 5.35, + "step": 250999 + }, + { + "epoch": 5.35, + "learning_rate": 5.083795918367347e-05, + "loss": 1.8915, + "step": 251000 + }, + { + "FLOPS loss": 0.027263039723038673, + "L0_d": 3158.34, + "MLM loss": 2.0646772384643555, + "epoch": 5.36, + "step": 251499 + }, + { + "epoch": 5.36, + "learning_rate": 5.0735918367346947e-05, + "loss": 1.8907, + "step": 251500 + }, + { + "FLOPS loss": 0.026302088052034378, + "L0_d": 3060.11, + "MLM loss": 1.7769654989242554, + "epoch": 5.37, + "step": 251999 + }, + { + "epoch": 5.37, + "learning_rate": 5.063387755102041e-05, + "loss": 1.8918, + "step": 252000 + }, + { + "FLOPS loss": 0.026689713820815086, + "L0_d": 2597.42, + "MLM loss": 2.0476834774017334, + "epoch": 5.38, + "step": 252499 + }, + { + "epoch": 5.38, + "learning_rate": 5.053183673469388e-05, + "loss": 1.8961, + "step": 252500 + }, + { + "FLOPS loss": 0.029723016545176506, + "L0_d": 3018.95, + "MLM loss": 1.852935791015625, + "epoch": 5.39, + "step": 252999 + }, + { + "epoch": 5.39, + "learning_rate": 5.042979591836735e-05, + "loss": 1.8928, + "step": 253000 + }, + { + "FLOPS loss": 0.025388535112142563, + "L0_d": 1884.11, + "MLM loss": 1.8686188459396362, + "epoch": 5.4, + "step": 253499 + }, + { + "epoch": 5.4, + "learning_rate": 5.032775510204082e-05, + "loss": 1.8976, + "step": 253500 + }, + { + "FLOPS loss": 0.032775864005088806, + "L0_d": 2505.75, + "MLM loss": 1.8625152111053467, + "epoch": 5.41, + "step": 253999 + }, + { + "epoch": 5.41, + "learning_rate": 5.0226122448979596e-05, + "loss": 1.8995, + "step": 254000 + }, + { + "FLOPS loss": 0.030364518985152245, + "L0_d": 2600.05, + "MLM loss": 1.926785945892334, + "epoch": 5.42, + "step": 254499 + }, + { + "epoch": 5.42, + "learning_rate": 5.012408163265307e-05, + "loss": 1.905, + "step": 254500 + }, + { + "FLOPS loss": 0.030143748968839645, + "L0_d": 2478.72, + "MLM loss": 1.982461929321289, + "epoch": 5.43, + "step": 254999 + }, + { + "epoch": 5.43, + "learning_rate": 5.002204081632653e-05, + "loss": 1.8977, + "step": 255000 + }, + { + "FLOPS loss": 0.03670482337474823, + "L0_d": 2263.52, + "MLM loss": 1.7775113582611084, + "epoch": 5.44, + "step": 255499 + }, + { + "epoch": 5.44, + "learning_rate": 4.992e-05, + "loss": 1.8996, + "step": 255500 + }, + { + "FLOPS loss": 0.03422041982412338, + "L0_d": 2109.53, + "MLM loss": 1.787998914718628, + "epoch": 5.45, + "step": 255999 + }, + { + "epoch": 5.45, + "learning_rate": 4.9817959183673475e-05, + "loss": 1.9009, + "step": 256000 + }, + { + "FLOPS loss": 0.03274346888065338, + "L0_d": 2421.89, + "MLM loss": 1.9225132465362549, + "epoch": 5.46, + "step": 256499 + }, + { + "epoch": 5.46, + "learning_rate": 4.971591836734694e-05, + "loss": 1.9031, + "step": 256500 + }, + { + "FLOPS loss": 0.04315318539738655, + "L0_d": 2542.11, + "MLM loss": 1.941856026649475, + "epoch": 5.47, + "step": 256999 + }, + { + "epoch": 5.47, + "learning_rate": 4.961387755102041e-05, + "loss": 1.9076, + "step": 257000 + }, + { + "FLOPS loss": 0.03899309039115906, + "L0_d": 1758.94, + "MLM loss": 1.8869366645812988, + "epoch": 5.48, + "step": 257499 + }, + { + "epoch": 5.48, + "learning_rate": 4.951204081632653e-05, + "loss": 1.9053, + "step": 257500 + }, + { + "FLOPS loss": 0.04414968192577362, + "L0_d": 2086.28, + "MLM loss": 1.936894416809082, + "epoch": 5.49, + "step": 257999 + }, + { + "epoch": 5.49, + "learning_rate": 4.941e-05, + "loss": 1.9087, + "step": 258000 + }, + { + "FLOPS loss": 0.03893755003809929, + "L0_d": 2211.0, + "MLM loss": 1.8997561931610107, + "epoch": 5.5, + "step": 258499 + }, + { + "epoch": 5.5, + "learning_rate": 4.9307959183673474e-05, + "loss": 1.9008, + "step": 258500 + }, + { + "FLOPS loss": 0.039572618901729584, + "L0_d": 1918.89, + "MLM loss": 1.776296854019165, + "epoch": 5.52, + "step": 258999 + }, + { + "epoch": 5.52, + "learning_rate": 4.9205918367346946e-05, + "loss": 1.9048, + "step": 259000 + }, + { + "FLOPS loss": 0.04069853574037552, + "L0_d": 1673.44, + "MLM loss": 1.9040732383728027, + "epoch": 5.53, + "step": 259499 + }, + { + "epoch": 5.53, + "learning_rate": 4.9104081632653067e-05, + "loss": 1.9065, + "step": 259500 + }, + { + "FLOPS loss": 0.04161301255226135, + "L0_d": 1786.67, + "MLM loss": 1.7740488052368164, + "epoch": 5.54, + "step": 259999 + }, + { + "epoch": 5.54, + "learning_rate": 4.900204081632653e-05, + "loss": 1.9154, + "step": 260000 + }, + { + "FLOPS loss": 0.03743300214409828, + "L0_d": 1616.17, + "MLM loss": 1.9160172939300537, + "epoch": 5.55, + "step": 260499 + }, + { + "epoch": 5.55, + "learning_rate": 4.89e-05, + "loss": 1.9111, + "step": 260500 + }, + { + "FLOPS loss": 0.04368520900607109, + "L0_d": 1466.02, + "MLM loss": 2.037893772125244, + "epoch": 5.56, + "step": 260999 + }, + { + "epoch": 5.56, + "learning_rate": 4.879795918367347e-05, + "loss": 1.9109, + "step": 261000 + }, + { + "FLOPS loss": 0.04595090448856354, + "L0_d": 1555.14, + "MLM loss": 1.767559289932251, + "epoch": 5.57, + "step": 261499 + }, + { + "epoch": 5.57, + "learning_rate": 4.8695918367346946e-05, + "loss": 1.9161, + "step": 261500 + }, + { + "FLOPS loss": 0.04404296353459358, + "L0_d": 1743.38, + "MLM loss": 1.9173357486724854, + "epoch": 5.58, + "step": 261999 + }, + { + "epoch": 5.58, + "learning_rate": 4.859387755102041e-05, + "loss": 1.9176, + "step": 262000 + }, + { + "FLOPS loss": 0.04245808348059654, + "L0_d": 1125.33, + "MLM loss": 1.6865381002426147, + "epoch": 5.59, + "step": 262499 + }, + { + "epoch": 5.59, + "learning_rate": 4.849183673469388e-05, + "loss": 1.9178, + "step": 262500 + }, + { + "FLOPS loss": 0.04385161027312279, + "L0_d": 1418.67, + "MLM loss": 1.9055112600326538, + "epoch": 5.6, + "step": 262999 + }, + { + "epoch": 5.6, + "learning_rate": 4.8389795918367347e-05, + "loss": 1.9131, + "step": 263000 + }, + { + "FLOPS loss": 0.053213585168123245, + "L0_d": 2183.94, + "MLM loss": 1.9083770513534546, + "epoch": 5.61, + "step": 263499 + }, + { + "epoch": 5.61, + "learning_rate": 4.828795918367347e-05, + "loss": 1.9159, + "step": 263500 + }, + { + "FLOPS loss": 0.05162021145224571, + "L0_d": 1506.91, + "MLM loss": 1.8558990955352783, + "epoch": 5.62, + "step": 263999 + }, + { + "epoch": 5.62, + "learning_rate": 4.818591836734694e-05, + "loss": 1.9171, + "step": 264000 + }, + { + "FLOPS loss": 0.05895388498902321, + "L0_d": 1319.53, + "MLM loss": 1.7792572975158691, + "epoch": 5.63, + "step": 264499 + }, + { + "epoch": 5.63, + "learning_rate": 4.808387755102041e-05, + "loss": 1.9188, + "step": 264500 + }, + { + "FLOPS loss": 0.06398754566907883, + "L0_d": 1367.34, + "MLM loss": 1.8091468811035156, + "epoch": 5.64, + "step": 264999 + }, + { + "epoch": 5.64, + "learning_rate": 4.798183673469388e-05, + "loss": 1.9174, + "step": 265000 + }, + { + "FLOPS loss": 0.05075083673000336, + "L0_d": 1379.28, + "MLM loss": 1.8456506729125977, + "epoch": 5.65, + "step": 265499 + }, + { + "epoch": 5.65, + "learning_rate": 4.7879795918367346e-05, + "loss": 1.9232, + "step": 265500 + }, + { + "FLOPS loss": 0.0504881925880909, + "L0_d": 1447.92, + "MLM loss": 1.9336979389190674, + "epoch": 5.66, + "step": 265999 + }, + { + "epoch": 5.66, + "learning_rate": 4.7777959183673474e-05, + "loss": 1.9207, + "step": 266000 + }, + { + "FLOPS loss": 0.06466767936944962, + "L0_d": 1269.03, + "MLM loss": 1.8459978103637695, + "epoch": 5.68, + "step": 266499 + }, + { + "epoch": 5.68, + "learning_rate": 4.767591836734694e-05, + "loss": 1.9252, + "step": 266500 + }, + { + "FLOPS loss": 0.055939216166734695, + "L0_d": 1076.92, + "MLM loss": 1.9006898403167725, + "epoch": 5.69, + "step": 266999 + }, + { + "epoch": 5.69, + "learning_rate": 4.757387755102041e-05, + "loss": 1.9256, + "step": 267000 + }, + { + "FLOPS loss": 0.06076573580503464, + "L0_d": 1341.17, + "MLM loss": 1.926182508468628, + "epoch": 5.7, + "step": 267499 + }, + { + "epoch": 5.7, + "learning_rate": 4.747183673469388e-05, + "loss": 1.9289, + "step": 267500 + }, + { + "FLOPS loss": 0.054936155676841736, + "L0_d": 1288.95, + "MLM loss": 1.973268985748291, + "epoch": 5.71, + "step": 267999 + }, + { + "epoch": 5.71, + "learning_rate": 4.7369795918367346e-05, + "loss": 1.9284, + "step": 268000 + }, + { + "FLOPS loss": 0.06445150822401047, + "L0_d": 1231.92, + "MLM loss": 2.0671331882476807, + "epoch": 5.72, + "step": 268499 + }, + { + "epoch": 5.72, + "learning_rate": 4.7267959183673474e-05, + "loss": 1.9293, + "step": 268500 + }, + { + "FLOPS loss": 0.06608186662197113, + "L0_d": 1103.44, + "MLM loss": 1.9241859912872314, + "epoch": 5.73, + "step": 268999 + }, + { + "epoch": 5.73, + "learning_rate": 4.716591836734694e-05, + "loss": 1.9299, + "step": 269000 + }, + { + "FLOPS loss": 0.05536879971623421, + "L0_d": 945.16, + "MLM loss": 1.7656147480010986, + "epoch": 5.74, + "step": 269499 + }, + { + "epoch": 5.74, + "learning_rate": 4.706387755102041e-05, + "loss": 1.9338, + "step": 269500 + }, + { + "FLOPS loss": 0.06108957156538963, + "L0_d": 1119.91, + "MLM loss": 1.729440689086914, + "epoch": 5.75, + "step": 269999 + }, + { + "epoch": 5.75, + "learning_rate": 4.696183673469388e-05, + "loss": 1.9344, + "step": 270000 + }, + { + "FLOPS loss": 0.05857192724943161, + "L0_d": 900.25, + "MLM loss": 2.0199506282806396, + "epoch": 5.76, + "step": 270499 + }, + { + "epoch": 5.76, + "learning_rate": 4.686e-05, + "loss": 1.9318, + "step": 270500 + }, + { + "FLOPS loss": 0.06669124215841293, + "L0_d": 1039.64, + "MLM loss": 1.8732706308364868, + "epoch": 5.77, + "step": 270999 + }, + { + "epoch": 5.77, + "learning_rate": 4.6757959183673473e-05, + "loss": 1.936, + "step": 271000 + }, + { + "FLOPS loss": 0.06447934359312057, + "L0_d": 1104.81, + "MLM loss": 1.7664421796798706, + "epoch": 5.78, + "step": 271499 + }, + { + "epoch": 5.78, + "learning_rate": 4.665591836734694e-05, + "loss": 1.9361, + "step": 271500 + }, + { + "FLOPS loss": 0.07491941750049591, + "L0_d": 1102.53, + "MLM loss": 1.9895565509796143, + "epoch": 5.79, + "step": 271999 + }, + { + "epoch": 5.79, + "learning_rate": 4.655387755102041e-05, + "loss": 1.9386, + "step": 272000 + }, + { + "FLOPS loss": 0.06251773238182068, + "L0_d": 894.05, + "MLM loss": 1.930220365524292, + "epoch": 5.8, + "step": 272499 + }, + { + "epoch": 5.8, + "learning_rate": 4.645183673469388e-05, + "loss": 1.9419, + "step": 272500 + }, + { + "FLOPS loss": 0.06176398694515228, + "L0_d": 843.98, + "MLM loss": 1.6715091466903687, + "epoch": 5.81, + "step": 272999 + }, + { + "epoch": 5.81, + "learning_rate": 4.634979591836735e-05, + "loss": 1.9398, + "step": 273000 + }, + { + "FLOPS loss": 0.07493959367275238, + "L0_d": 864.36, + "MLM loss": 2.028404474258423, + "epoch": 5.82, + "step": 273499 + }, + { + "epoch": 5.82, + "learning_rate": 4.624795918367347e-05, + "loss": 1.9426, + "step": 273500 + }, + { + "FLOPS loss": 0.05773789435625076, + "L0_d": 694.45, + "MLM loss": 1.8639841079711914, + "epoch": 5.83, + "step": 273999 + }, + { + "epoch": 5.84, + "learning_rate": 4.614591836734694e-05, + "loss": 1.944, + "step": 274000 + }, + { + "FLOPS loss": 0.0750850960612297, + "L0_d": 890.12, + "MLM loss": 1.9028539657592773, + "epoch": 5.85, + "step": 274499 + }, + { + "epoch": 5.85, + "learning_rate": 4.604387755102041e-05, + "loss": 1.9442, + "step": 274500 + }, + { + "FLOPS loss": 0.07717510312795639, + "L0_d": 1054.8, + "MLM loss": 1.8612613677978516, + "epoch": 5.86, + "step": 274999 + }, + { + "epoch": 5.86, + "learning_rate": 4.5941836734693874e-05, + "loss": 1.9462, + "step": 275000 + }, + { + "FLOPS loss": 0.07143872231245041, + "L0_d": 700.31, + "MLM loss": 1.8450987339019775, + "epoch": 5.87, + "step": 275499 + }, + { + "epoch": 5.87, + "learning_rate": 4.583979591836735e-05, + "loss": 1.9472, + "step": 275500 + }, + { + "FLOPS loss": 0.08762510865926743, + "L0_d": 987.81, + "MLM loss": 1.8697837591171265, + "epoch": 5.88, + "step": 275999 + }, + { + "epoch": 5.88, + "learning_rate": 4.573795918367347e-05, + "loss": 1.9489, + "step": 276000 + }, + { + "FLOPS loss": 0.06401695311069489, + "L0_d": 813.38, + "MLM loss": 1.9566137790679932, + "epoch": 5.89, + "step": 276499 + }, + { + "epoch": 5.89, + "learning_rate": 4.5635918367346945e-05, + "loss": 1.9506, + "step": 276500 + }, + { + "FLOPS loss": 0.06458479911088943, + "L0_d": 561.38, + "MLM loss": 1.8860130310058594, + "epoch": 5.9, + "step": 276999 + }, + { + "epoch": 5.9, + "learning_rate": 4.553387755102041e-05, + "loss": 1.9504, + "step": 277000 + }, + { + "FLOPS loss": 0.06730065494775772, + "L0_d": 896.34, + "MLM loss": 1.893805980682373, + "epoch": 5.91, + "step": 277499 + }, + { + "epoch": 5.91, + "learning_rate": 4.543183673469388e-05, + "loss": 1.954, + "step": 277500 + }, + { + "FLOPS loss": 0.08798781037330627, + "L0_d": 1083.72, + "MLM loss": 1.9584965705871582, + "epoch": 5.92, + "step": 277999 + }, + { + "epoch": 5.92, + "learning_rate": 4.533e-05, + "loss": 1.9502, + "step": 278000 + }, + { + "FLOPS loss": 0.07679998129606247, + "L0_d": 1217.53, + "MLM loss": 1.9031168222427368, + "epoch": 5.93, + "step": 278499 + }, + { + "epoch": 5.93, + "learning_rate": 4.522795918367347e-05, + "loss": 1.9538, + "step": 278500 + }, + { + "FLOPS loss": 0.06088661774992943, + "L0_d": 524.78, + "MLM loss": 1.8549163341522217, + "epoch": 5.94, + "step": 278999 + }, + { + "epoch": 5.94, + "learning_rate": 4.5125918367346944e-05, + "loss": 1.9559, + "step": 279000 + }, + { + "FLOPS loss": 0.0638987198472023, + "L0_d": 638.92, + "MLM loss": 2.013615131378174, + "epoch": 5.95, + "step": 279499 + }, + { + "epoch": 5.95, + "learning_rate": 4.502387755102041e-05, + "loss": 1.9549, + "step": 279500 + }, + { + "FLOPS loss": 0.10002320259809494, + "L0_d": 722.52, + "MLM loss": 1.861433506011963, + "epoch": 5.96, + "step": 279999 + }, + { + "epoch": 5.96, + "learning_rate": 4.492183673469388e-05, + "loss": 1.9545, + "step": 280000 + }, + { + "FLOPS loss": 0.06846465915441513, + "L0_d": 619.73, + "MLM loss": 2.04079270362854, + "epoch": 5.97, + "step": 280499 + }, + { + "epoch": 5.97, + "learning_rate": 4.482e-05, + "loss": 1.9575, + "step": 280500 + }, + { + "FLOPS loss": 0.08104480803012848, + "L0_d": 706.91, + "MLM loss": 1.932988166809082, + "epoch": 5.98, + "step": 280999 + }, + { + "epoch": 5.98, + "learning_rate": 4.4717959183673466e-05, + "loss": 1.956, + "step": 281000 + }, + { + "FLOPS loss": 0.06805707514286041, + "L0_d": 595.69, + "MLM loss": 1.7187745571136475, + "epoch": 5.99, + "step": 281499 + }, + { + "epoch": 5.99, + "learning_rate": 4.4615918367346944e-05, + "loss": 1.9531, + "step": 281500 + }, + { + "FLOPS loss": 0.07271081209182739, + "L0_d": 627.72, + "MLM loss": 1.9254471063613892, + "epoch": 6.01, + "step": 281999 + }, + { + "epoch": 6.01, + "learning_rate": 4.451387755102041e-05, + "loss": 1.9552, + "step": 282000 + }, + { + "FLOPS loss": 0.08167015016078949, + "L0_d": 696.19, + "MLM loss": 2.0464000701904297, + "epoch": 6.02, + "step": 282499 + }, + { + "epoch": 6.02, + "learning_rate": 4.441183673469388e-05, + "loss": 1.9541, + "step": 282500 + }, + { + "FLOPS loss": 0.08481060713529587, + "L0_d": 886.92, + "MLM loss": 1.9736199378967285, + "epoch": 6.03, + "step": 282999 + }, + { + "epoch": 6.03, + "learning_rate": 4.431e-05, + "loss": 1.9535, + "step": 283000 + }, + { + "FLOPS loss": 0.06780994683504105, + "L0_d": 677.19, + "MLM loss": 1.9477245807647705, + "epoch": 6.04, + "step": 283499 + }, + { + "epoch": 6.04, + "learning_rate": 4.420795918367347e-05, + "loss": 1.9543, + "step": 283500 + }, + { + "FLOPS loss": 0.07411380112171173, + "L0_d": 743.67, + "MLM loss": 2.0674924850463867, + "epoch": 6.05, + "step": 283999 + }, + { + "epoch": 6.05, + "learning_rate": 4.4105918367346944e-05, + "loss": 1.9532, + "step": 284000 + }, + { + "FLOPS loss": 0.07907848060131073, + "L0_d": 974.5, + "MLM loss": 1.780951738357544, + "epoch": 6.06, + "step": 284499 + }, + { + "epoch": 6.06, + "learning_rate": 4.400387755102041e-05, + "loss": 1.9521, + "step": 284500 + }, + { + "FLOPS loss": 0.07107169926166534, + "L0_d": 681.83, + "MLM loss": 1.8572118282318115, + "epoch": 6.07, + "step": 284999 + }, + { + "epoch": 6.07, + "learning_rate": 4.390183673469388e-05, + "loss": 1.9522, + "step": 285000 + }, + { + "FLOPS loss": 0.06437213718891144, + "L0_d": 544.11, + "MLM loss": 1.940516471862793, + "epoch": 6.08, + "step": 285499 + }, + { + "epoch": 6.08, + "learning_rate": 4.38e-05, + "loss": 1.9512, + "step": 285500 + }, + { + "FLOPS loss": 0.08128892630338669, + "L0_d": 735.03, + "MLM loss": 1.9186279773712158, + "epoch": 6.09, + "step": 285999 + }, + { + "epoch": 6.09, + "learning_rate": 4.369795918367347e-05, + "loss": 1.9538, + "step": 286000 + }, + { + "FLOPS loss": 0.08808960020542145, + "L0_d": 1024.83, + "MLM loss": 1.988155722618103, + "epoch": 6.1, + "step": 286499 + }, + { + "epoch": 6.1, + "learning_rate": 4.359591836734694e-05, + "loss": 1.9537, + "step": 286500 + }, + { + "FLOPS loss": 0.07475665211677551, + "L0_d": 571.81, + "MLM loss": 1.839475393295288, + "epoch": 6.11, + "step": 286999 + }, + { + "epoch": 6.11, + "learning_rate": 4.3493877551020415e-05, + "loss": 1.9508, + "step": 287000 + }, + { + "FLOPS loss": 0.06577183306217194, + "L0_d": 429.23, + "MLM loss": 1.8133153915405273, + "epoch": 6.12, + "step": 287499 + }, + { + "epoch": 6.12, + "learning_rate": 4.3392040816326536e-05, + "loss": 1.9493, + "step": 287500 + }, + { + "FLOPS loss": 0.07787065207958221, + "L0_d": 620.91, + "MLM loss": 1.8850531578063965, + "epoch": 6.13, + "step": 287999 + }, + { + "epoch": 6.13, + "learning_rate": 4.329e-05, + "loss": 1.9523, + "step": 288000 + }, + { + "FLOPS loss": 0.07825678586959839, + "L0_d": 962.11, + "MLM loss": 1.900402545928955, + "epoch": 6.14, + "step": 288499 + }, + { + "epoch": 6.14, + "learning_rate": 4.318795918367347e-05, + "loss": 1.9498, + "step": 288500 + }, + { + "FLOPS loss": 0.06425338983535767, + "L0_d": 524.34, + "MLM loss": 1.85397469997406, + "epoch": 6.15, + "step": 288999 + }, + { + "epoch": 6.15, + "learning_rate": 4.308591836734694e-05, + "loss": 1.9492, + "step": 289000 + }, + { + "FLOPS loss": 0.08077821135520935, + "L0_d": 669.0, + "MLM loss": 2.0354411602020264, + "epoch": 6.17, + "step": 289499 + }, + { + "epoch": 6.17, + "learning_rate": 4.2983877551020415e-05, + "loss": 1.9522, + "step": 289500 + }, + { + "FLOPS loss": 0.07401479035615921, + "L0_d": 786.83, + "MLM loss": 1.9391406774520874, + "epoch": 6.18, + "step": 289999 + }, + { + "epoch": 6.18, + "learning_rate": 4.2882040816326536e-05, + "loss": 1.9523, + "step": 290000 + }, + { + "FLOPS loss": 0.06442100554704666, + "L0_d": 726.97, + "MLM loss": 1.934419870376587, + "epoch": 6.19, + "step": 290499 + }, + { + "epoch": 6.19, + "learning_rate": 4.278e-05, + "loss": 1.947, + "step": 290500 + }, + { + "FLOPS loss": 0.07297225296497345, + "L0_d": 774.31, + "MLM loss": 1.890745997428894, + "epoch": 6.2, + "step": 290999 + }, + { + "epoch": 6.2, + "learning_rate": 4.267795918367347e-05, + "loss": 1.9549, + "step": 291000 + }, + { + "FLOPS loss": 0.07912027090787888, + "L0_d": 898.61, + "MLM loss": 2.038191318511963, + "epoch": 6.21, + "step": 291499 + }, + { + "epoch": 6.21, + "learning_rate": 4.2575918367346937e-05, + "loss": 1.9493, + "step": 291500 + }, + { + "FLOPS loss": 0.0650862529873848, + "L0_d": 768.06, + "MLM loss": 1.9675188064575195, + "epoch": 6.22, + "step": 291999 + }, + { + "epoch": 6.22, + "learning_rate": 4.2474081632653064e-05, + "loss": 1.9483, + "step": 292000 + }, + { + "FLOPS loss": 0.07283622026443481, + "L0_d": 700.41, + "MLM loss": 1.8989489078521729, + "epoch": 6.23, + "step": 292499 + }, + { + "epoch": 6.23, + "learning_rate": 4.237204081632653e-05, + "loss": 1.9513, + "step": 292500 + }, + { + "FLOPS loss": 0.06748488545417786, + "L0_d": 741.77, + "MLM loss": 1.887213945388794, + "epoch": 6.24, + "step": 292999 + }, + { + "epoch": 6.24, + "learning_rate": 4.227000000000001e-05, + "loss": 1.9479, + "step": 293000 + }, + { + "FLOPS loss": 0.07973089814186096, + "L0_d": 664.5, + "MLM loss": 2.0132598876953125, + "epoch": 6.25, + "step": 293499 + }, + { + "epoch": 6.25, + "learning_rate": 4.216795918367347e-05, + "loss": 1.9496, + "step": 293500 + }, + { + "FLOPS loss": 0.07241816818714142, + "L0_d": 707.08, + "MLM loss": 1.7604951858520508, + "epoch": 6.26, + "step": 293999 + }, + { + "epoch": 6.26, + "learning_rate": 4.206612244897959e-05, + "loss": 1.9465, + "step": 294000 + }, + { + "FLOPS loss": 0.07850213348865509, + "L0_d": 698.77, + "MLM loss": 1.9729360342025757, + "epoch": 6.27, + "step": 294499 + }, + { + "epoch": 6.27, + "learning_rate": 4.1964081632653064e-05, + "loss": 1.9481, + "step": 294500 + }, + { + "FLOPS loss": 0.08519662916660309, + "L0_d": 785.38, + "MLM loss": 1.8330121040344238, + "epoch": 6.28, + "step": 294999 + }, + { + "epoch": 6.28, + "learning_rate": 4.186204081632653e-05, + "loss": 1.9431, + "step": 295000 + }, + { + "FLOPS loss": 0.05845937877893448, + "L0_d": 621.36, + "MLM loss": 1.9399977922439575, + "epoch": 6.29, + "step": 295499 + }, + { + "epoch": 6.29, + "learning_rate": 4.176000000000001e-05, + "loss": 1.947, + "step": 295500 + }, + { + "FLOPS loss": 0.06915000826120377, + "L0_d": 650.09, + "MLM loss": 1.8091906309127808, + "epoch": 6.3, + "step": 295999 + }, + { + "epoch": 6.3, + "learning_rate": 4.165816326530613e-05, + "loss": 1.9482, + "step": 296000 + }, + { + "FLOPS loss": 0.07740022242069244, + "L0_d": 687.33, + "MLM loss": 1.9560225009918213, + "epoch": 6.31, + "step": 296499 + }, + { + "epoch": 6.31, + "learning_rate": 4.155612244897959e-05, + "loss": 1.9488, + "step": 296500 + }, + { + "FLOPS loss": 0.08005591481924057, + "L0_d": 899.48, + "MLM loss": 1.7681009769439697, + "epoch": 6.32, + "step": 296999 + }, + { + "epoch": 6.32, + "learning_rate": 4.1454081632653064e-05, + "loss": 1.9483, + "step": 297000 + }, + { + "FLOPS loss": 0.06122187525033951, + "L0_d": 580.2, + "MLM loss": 1.9148626327514648, + "epoch": 6.34, + "step": 297499 + }, + { + "epoch": 6.34, + "learning_rate": 4.135204081632653e-05, + "loss": 1.9475, + "step": 297500 + }, + { + "FLOPS loss": 0.07536512613296509, + "L0_d": 643.31, + "MLM loss": 1.77424156665802, + "epoch": 6.35, + "step": 297999 + }, + { + "epoch": 6.35, + "learning_rate": 4.125e-05, + "loss": 1.9466, + "step": 298000 + }, + { + "FLOPS loss": 0.0731494352221489, + "L0_d": 579.67, + "MLM loss": 1.8063685894012451, + "epoch": 6.36, + "step": 298499 + }, + { + "epoch": 6.36, + "learning_rate": 4.114816326530612e-05, + "loss": 1.9478, + "step": 298500 + }, + { + "FLOPS loss": 0.09138891100883484, + "L0_d": 789.53, + "MLM loss": 1.8201489448547363, + "epoch": 6.37, + "step": 298999 + }, + { + "epoch": 6.37, + "learning_rate": 4.10461224489796e-05, + "loss": 1.9477, + "step": 299000 + }, + { + "FLOPS loss": 0.07261373102664948, + "L0_d": 794.72, + "MLM loss": 1.878649353981018, + "epoch": 6.38, + "step": 299499 + }, + { + "epoch": 6.38, + "learning_rate": 4.0944081632653063e-05, + "loss": 1.9458, + "step": 299500 + }, + { + "FLOPS loss": 0.08437049388885498, + "L0_d": 1008.7, + "MLM loss": 1.8229589462280273, + "epoch": 6.39, + "step": 299999 + }, + { + "epoch": 6.39, + "learning_rate": 4.0842040816326535e-05, + "loss": 1.9434, + "step": 300000 + }, + { + "FLOPS loss": 0.08365792036056519, + "L0_d": 657.16, + "MLM loss": 1.788762092590332, + "epoch": 6.4, + "step": 300499 + }, + { + "epoch": 6.4, + "learning_rate": 4.0740204081632656e-05, + "loss": 1.9439, + "step": 300500 + }, + { + "FLOPS loss": 0.07193128764629364, + "L0_d": 711.5, + "MLM loss": 1.8217332363128662, + "epoch": 6.41, + "step": 300999 + }, + { + "epoch": 6.41, + "learning_rate": 4.063816326530612e-05, + "loss": 1.946, + "step": 301000 + }, + { + "FLOPS loss": 0.09581154584884644, + "L0_d": 1142.42, + "MLM loss": 1.8188657760620117, + "epoch": 6.42, + "step": 301499 + }, + { + "epoch": 6.42, + "learning_rate": 4.05361224489796e-05, + "loss": 1.945, + "step": 301500 + }, + { + "FLOPS loss": 0.08122092485427856, + "L0_d": 975.69, + "MLM loss": 1.894965648651123, + "epoch": 6.43, + "step": 301999 + }, + { + "epoch": 6.43, + "learning_rate": 4.043408163265306e-05, + "loss": 1.9434, + "step": 302000 + }, + { + "FLOPS loss": 0.061854831874370575, + "L0_d": 463.28, + "MLM loss": 1.888474702835083, + "epoch": 6.44, + "step": 302499 + }, + { + "epoch": 6.44, + "learning_rate": 4.0332040816326535e-05, + "loss": 1.9453, + "step": 302500 + }, + { + "FLOPS loss": 0.08056728541851044, + "L0_d": 863.88, + "MLM loss": 1.8775725364685059, + "epoch": 6.45, + "step": 302999 + }, + { + "epoch": 6.45, + "learning_rate": 4.0230204081632655e-05, + "loss": 1.9457, + "step": 303000 + }, + { + "FLOPS loss": 0.07168541103601456, + "L0_d": 606.09, + "MLM loss": 1.7786054611206055, + "epoch": 6.46, + "step": 303499 + }, + { + "epoch": 6.46, + "learning_rate": 4.012816326530613e-05, + "loss": 1.9436, + "step": 303500 + }, + { + "FLOPS loss": 0.08951392024755478, + "L0_d": 1066.7, + "MLM loss": 1.9425861835479736, + "epoch": 6.47, + "step": 303999 + }, + { + "epoch": 6.47, + "learning_rate": 4.002612244897959e-05, + "loss": 1.9439, + "step": 304000 + }, + { + "FLOPS loss": 0.09145156294107437, + "L0_d": 692.55, + "MLM loss": 2.026456832885742, + "epoch": 6.48, + "step": 304499 + }, + { + "epoch": 6.48, + "learning_rate": 3.992408163265306e-05, + "loss": 1.9466, + "step": 304500 + }, + { + "FLOPS loss": 0.06714989244937897, + "L0_d": 774.73, + "MLM loss": 1.8669438362121582, + "epoch": 6.5, + "step": 304999 + }, + { + "epoch": 6.5, + "learning_rate": 3.9822040816326534e-05, + "loss": 1.9479, + "step": 305000 + }, + { + "FLOPS loss": 0.06919477134943008, + "L0_d": 791.53, + "MLM loss": 1.8139466047286987, + "epoch": 6.51, + "step": 305499 + }, + { + "epoch": 6.51, + "learning_rate": 3.9720204081632655e-05, + "loss": 1.9475, + "step": 305500 + }, + { + "FLOPS loss": 0.08101382851600647, + "L0_d": 806.67, + "MLM loss": 1.8381314277648926, + "epoch": 6.52, + "step": 305999 + }, + { + "epoch": 6.52, + "learning_rate": 3.961816326530613e-05, + "loss": 1.9405, + "step": 306000 + }, + { + "FLOPS loss": 0.07709395885467529, + "L0_d": 754.17, + "MLM loss": 1.8039436340332031, + "epoch": 6.53, + "step": 306499 + }, + { + "epoch": 6.53, + "learning_rate": 3.951612244897959e-05, + "loss": 1.941, + "step": 306500 + }, + { + "FLOPS loss": 0.07149617373943329, + "L0_d": 715.22, + "MLM loss": 1.7575922012329102, + "epoch": 6.54, + "step": 306999 + }, + { + "epoch": 6.54, + "learning_rate": 3.941408163265306e-05, + "loss": 1.9399, + "step": 307000 + }, + { + "FLOPS loss": 0.09008293598890305, + "L0_d": 860.47, + "MLM loss": 1.9457957744598389, + "epoch": 6.55, + "step": 307499 + }, + { + "epoch": 6.55, + "learning_rate": 3.9312244897959184e-05, + "loss": 1.9408, + "step": 307500 + }, + { + "FLOPS loss": 0.06867194920778275, + "L0_d": 567.94, + "MLM loss": 1.8648253679275513, + "epoch": 6.56, + "step": 307999 + }, + { + "epoch": 6.56, + "learning_rate": 3.9210204081632655e-05, + "loss": 1.9429, + "step": 308000 + }, + { + "FLOPS loss": 0.07760652899742126, + "L0_d": 842.48, + "MLM loss": 1.827282428741455, + "epoch": 6.57, + "step": 308499 + }, + { + "epoch": 6.57, + "learning_rate": 3.9108163265306126e-05, + "loss": 1.9409, + "step": 308500 + }, + { + "FLOPS loss": 0.06471911817789078, + "L0_d": 742.31, + "MLM loss": 2.1198086738586426, + "epoch": 6.58, + "step": 308999 + }, + { + "epoch": 6.58, + "learning_rate": 3.900612244897959e-05, + "loss": 1.941, + "step": 309000 + }, + { + "FLOPS loss": 0.07304774969816208, + "L0_d": 655.27, + "MLM loss": 1.845359444618225, + "epoch": 6.59, + "step": 309499 + }, + { + "epoch": 6.59, + "learning_rate": 3.890428571428572e-05, + "loss": 1.9416, + "step": 309500 + }, + { + "FLOPS loss": 0.08457273244857788, + "L0_d": 622.2, + "MLM loss": 2.051377534866333, + "epoch": 6.6, + "step": 309999 + }, + { + "epoch": 6.6, + "learning_rate": 3.880224489795918e-05, + "loss": 1.9402, + "step": 310000 + }, + { + "FLOPS loss": 0.06825979799032211, + "L0_d": 514.53, + "MLM loss": 1.8188495635986328, + "epoch": 6.61, + "step": 310499 + }, + { + "epoch": 6.61, + "learning_rate": 3.8700204081632655e-05, + "loss": 1.9433, + "step": 310500 + }, + { + "FLOPS loss": 0.08188638836145401, + "L0_d": 932.84, + "MLM loss": 1.9714046716690063, + "epoch": 6.62, + "step": 310999 + }, + { + "epoch": 6.62, + "learning_rate": 3.8598163265306126e-05, + "loss": 1.9389, + "step": 311000 + }, + { + "FLOPS loss": 0.07594864070415497, + "L0_d": 988.8, + "MLM loss": 1.8642597198486328, + "epoch": 6.63, + "step": 311499 + }, + { + "epoch": 6.63, + "learning_rate": 3.849612244897959e-05, + "loss": 1.9414, + "step": 311500 + }, + { + "FLOPS loss": 0.07680333405733109, + "L0_d": 648.48, + "MLM loss": 1.8027818202972412, + "epoch": 6.64, + "step": 311999 + }, + { + "epoch": 6.64, + "learning_rate": 3.839408163265306e-05, + "loss": 1.94, + "step": 312000 + }, + { + "FLOPS loss": 0.0731196478009224, + "L0_d": 701.98, + "MLM loss": 1.8166855573654175, + "epoch": 6.65, + "step": 312499 + }, + { + "epoch": 6.65, + "learning_rate": 3.829224489795918e-05, + "loss": 1.9385, + "step": 312500 + }, + { + "FLOPS loss": 0.08190134912729263, + "L0_d": 796.22, + "MLM loss": 2.0480880737304688, + "epoch": 6.67, + "step": 312999 + }, + { + "epoch": 6.67, + "learning_rate": 3.8190204081632655e-05, + "loss": 1.9408, + "step": 313000 + }, + { + "FLOPS loss": 0.07585899531841278, + "L0_d": 712.28, + "MLM loss": 1.9030694961547852, + "epoch": 6.68, + "step": 313499 + }, + { + "epoch": 6.68, + "learning_rate": 3.8088163265306126e-05, + "loss": 1.9398, + "step": 313500 + }, + { + "FLOPS loss": 0.0705098956823349, + "L0_d": 658.12, + "MLM loss": 1.8781006336212158, + "epoch": 6.69, + "step": 313999 + }, + { + "epoch": 6.69, + "learning_rate": 3.798632653061225e-05, + "loss": 1.9411, + "step": 314000 + }, + { + "FLOPS loss": 0.0811757817864418, + "L0_d": 609.66, + "MLM loss": 1.8772015571594238, + "epoch": 6.7, + "step": 314499 + }, + { + "epoch": 6.7, + "learning_rate": 3.788428571428572e-05, + "loss": 1.9359, + "step": 314500 + }, + { + "FLOPS loss": 0.06603971868753433, + "L0_d": 583.78, + "MLM loss": 1.8940114974975586, + "epoch": 6.71, + "step": 314999 + }, + { + "epoch": 6.71, + "learning_rate": 3.778224489795918e-05, + "loss": 1.9385, + "step": 315000 + }, + { + "FLOPS loss": 0.07398807257413864, + "L0_d": 744.42, + "MLM loss": 1.8569490909576416, + "epoch": 6.72, + "step": 315499 + }, + { + "epoch": 6.72, + "learning_rate": 3.7680204081632654e-05, + "loss": 1.9415, + "step": 315500 + }, + { + "FLOPS loss": 0.06338933110237122, + "L0_d": 631.48, + "MLM loss": 1.8887548446655273, + "epoch": 6.73, + "step": 315999 + }, + { + "epoch": 6.73, + "learning_rate": 3.7578163265306126e-05, + "loss": 1.9367, + "step": 316000 + }, + { + "FLOPS loss": 0.06369595974683762, + "L0_d": 808.41, + "MLM loss": 1.889241337776184, + "epoch": 6.74, + "step": 316499 + }, + { + "epoch": 6.74, + "learning_rate": 3.74761224489796e-05, + "loss": 1.9429, + "step": 316500 + }, + { + "FLOPS loss": 0.06813719868659973, + "L0_d": 613.62, + "MLM loss": 1.9069278240203857, + "epoch": 6.75, + "step": 316999 + }, + { + "epoch": 6.75, + "learning_rate": 3.737408163265306e-05, + "loss": 1.9385, + "step": 317000 + }, + { + "FLOPS loss": 0.07158663868904114, + "L0_d": 745.84, + "MLM loss": 1.7616331577301025, + "epoch": 6.76, + "step": 317499 + }, + { + "epoch": 6.76, + "learning_rate": 3.7272040816326533e-05, + "loss": 1.9356, + "step": 317500 + }, + { + "FLOPS loss": 0.09117750078439713, + "L0_d": 624.55, + "MLM loss": 1.9352805614471436, + "epoch": 6.77, + "step": 317999 + }, + { + "epoch": 6.77, + "learning_rate": 3.7170204081632654e-05, + "loss": 1.9347, + "step": 318000 + }, + { + "FLOPS loss": 0.06324110925197601, + "L0_d": 602.06, + "MLM loss": 1.833200454711914, + "epoch": 6.78, + "step": 318499 + }, + { + "epoch": 6.78, + "learning_rate": 3.706816326530612e-05, + "loss": 1.9401, + "step": 318500 + }, + { + "FLOPS loss": 0.08155985921621323, + "L0_d": 966.45, + "MLM loss": 1.7821117639541626, + "epoch": 6.79, + "step": 318999 + }, + { + "epoch": 6.79, + "learning_rate": 3.69661224489796e-05, + "loss": 1.9365, + "step": 319000 + }, + { + "FLOPS loss": 0.06751174479722977, + "L0_d": 627.66, + "MLM loss": 1.6768591403961182, + "epoch": 6.8, + "step": 319499 + }, + { + "epoch": 6.8, + "learning_rate": 3.686408163265306e-05, + "loss": 1.932, + "step": 319500 + }, + { + "FLOPS loss": 0.08122416585683823, + "L0_d": 705.98, + "MLM loss": 1.8377563953399658, + "epoch": 6.81, + "step": 319999 + }, + { + "epoch": 6.81, + "learning_rate": 3.676204081632653e-05, + "loss": 1.9361, + "step": 320000 + }, + { + "FLOPS loss": 0.06721948832273483, + "L0_d": 708.16, + "MLM loss": 1.8212740421295166, + "epoch": 6.83, + "step": 320499 + }, + { + "epoch": 6.83, + "learning_rate": 3.6660204081632654e-05, + "loss": 1.9352, + "step": 320500 + }, + { + "FLOPS loss": 0.09321010857820511, + "L0_d": 1287.06, + "MLM loss": 1.8241631984710693, + "epoch": 6.84, + "step": 320999 + }, + { + "epoch": 6.84, + "learning_rate": 3.6558163265306125e-05, + "loss": 1.9323, + "step": 321000 + }, + { + "FLOPS loss": 0.06190552935004234, + "L0_d": 528.55, + "MLM loss": 1.9443321228027344, + "epoch": 6.85, + "step": 321499 + }, + { + "epoch": 6.85, + "learning_rate": 3.64561224489796e-05, + "loss": 1.936, + "step": 321500 + }, + { + "FLOPS loss": 0.061153117567300797, + "L0_d": 520.72, + "MLM loss": 1.832186222076416, + "epoch": 6.86, + "step": 321999 + }, + { + "epoch": 6.86, + "learning_rate": 3.635408163265306e-05, + "loss": 1.9342, + "step": 322000 + }, + { + "FLOPS loss": 0.09563008695840836, + "L0_d": 753.09, + "MLM loss": 1.8728309869766235, + "epoch": 6.87, + "step": 322499 + }, + { + "epoch": 6.87, + "learning_rate": 3.625224489795919e-05, + "loss": 1.9354, + "step": 322500 + }, + { + "FLOPS loss": 0.07957378774881363, + "L0_d": 795.86, + "MLM loss": 1.8879680633544922, + "epoch": 6.88, + "step": 322999 + }, + { + "epoch": 6.88, + "learning_rate": 3.6150204081632654e-05, + "loss": 1.9349, + "step": 323000 + }, + { + "FLOPS loss": 0.06421475112438202, + "L0_d": 706.42, + "MLM loss": 1.9722673892974854, + "epoch": 6.89, + "step": 323499 + }, + { + "epoch": 6.89, + "learning_rate": 3.6048163265306125e-05, + "loss": 1.936, + "step": 323500 + }, + { + "FLOPS loss": 0.0862734466791153, + "L0_d": 833.16, + "MLM loss": 1.8255963325500488, + "epoch": 6.9, + "step": 323999 + }, + { + "epoch": 6.9, + "learning_rate": 3.594612244897959e-05, + "loss": 1.934, + "step": 324000 + }, + { + "FLOPS loss": 0.06920310854911804, + "L0_d": 725.91, + "MLM loss": 1.8486316204071045, + "epoch": 6.91, + "step": 324499 + }, + { + "epoch": 6.91, + "learning_rate": 3.584408163265307e-05, + "loss": 1.9328, + "step": 324500 + }, + { + "FLOPS loss": 0.07331109046936035, + "L0_d": 611.11, + "MLM loss": 1.7338364124298096, + "epoch": 6.92, + "step": 324999 + }, + { + "epoch": 6.92, + "learning_rate": 3.574224489795919e-05, + "loss": 1.9357, + "step": 325000 + }, + { + "FLOPS loss": 0.06642386317253113, + "L0_d": 547.34, + "MLM loss": 1.8029348850250244, + "epoch": 6.93, + "step": 325499 + }, + { + "epoch": 6.93, + "learning_rate": 3.5640204081632654e-05, + "loss": 1.9292, + "step": 325500 + }, + { + "FLOPS loss": 0.06935901939868927, + "L0_d": 794.67, + "MLM loss": 2.0707836151123047, + "epoch": 6.94, + "step": 325999 + }, + { + "epoch": 6.94, + "learning_rate": 3.5538163265306125e-05, + "loss": 1.9316, + "step": 326000 + }, + { + "FLOPS loss": 0.0792873278260231, + "L0_d": 834.64, + "MLM loss": 1.8418869972229004, + "epoch": 6.95, + "step": 326499 + }, + { + "epoch": 6.95, + "learning_rate": 3.543612244897959e-05, + "loss": 1.9348, + "step": 326500 + }, + { + "FLOPS loss": 0.07487780600786209, + "L0_d": 580.11, + "MLM loss": 1.8635060787200928, + "epoch": 6.96, + "step": 326999 + }, + { + "epoch": 6.96, + "learning_rate": 3.533428571428572e-05, + "loss": 1.9305, + "step": 327000 + }, + { + "FLOPS loss": 0.08103643357753754, + "L0_d": 751.62, + "MLM loss": 1.7533881664276123, + "epoch": 6.97, + "step": 327499 + }, + { + "epoch": 6.97, + "learning_rate": 3.523224489795919e-05, + "loss": 1.9311, + "step": 327500 + }, + { + "FLOPS loss": 0.07984720915555954, + "L0_d": 857.66, + "MLM loss": 1.7085926532745361, + "epoch": 6.98, + "step": 327999 + }, + { + "epoch": 6.98, + "learning_rate": 3.513020408163265e-05, + "loss": 1.9311, + "step": 328000 + }, + { + "FLOPS loss": 0.09680966287851334, + "L0_d": 1293.66, + "MLM loss": 1.8928158283233643, + "epoch": 7.0, + "step": 328499 + }, + { + "epoch": 7.0, + "learning_rate": 3.5028163265306125e-05, + "loss": 1.9302, + "step": 328500 + }, + { + "FLOPS loss": 0.05976860970258713, + "L0_d": 521.03, + "MLM loss": 1.7658557891845703, + "epoch": 7.01, + "step": 328999 + }, + { + "epoch": 7.01, + "learning_rate": 3.4926326530612246e-05, + "loss": 1.9286, + "step": 329000 + }, + { + "FLOPS loss": 0.05823574215173721, + "L0_d": 734.66, + "MLM loss": 1.8498952388763428, + "epoch": 7.02, + "step": 329499 + }, + { + "epoch": 7.02, + "learning_rate": 3.482428571428572e-05, + "loss": 1.9281, + "step": 329500 + }, + { + "FLOPS loss": 0.09669086337089539, + "L0_d": 855.12, + "MLM loss": 1.881850242614746, + "epoch": 7.03, + "step": 329999 + }, + { + "epoch": 7.03, + "learning_rate": 3.472224489795918e-05, + "loss": 1.9277, + "step": 330000 + }, + { + "FLOPS loss": 0.08522310107946396, + "L0_d": 818.73, + "MLM loss": 1.786057710647583, + "epoch": 7.04, + "step": 330499 + }, + { + "epoch": 7.04, + "learning_rate": 3.462020408163266e-05, + "loss": 1.9314, + "step": 330500 + }, + { + "FLOPS loss": 0.06717081367969513, + "L0_d": 763.72, + "MLM loss": 1.8741955757141113, + "epoch": 7.05, + "step": 330999 + }, + { + "epoch": 7.05, + "learning_rate": 3.4518163265306125e-05, + "loss": 1.924, + "step": 331000 + }, + { + "FLOPS loss": 0.07524064928293228, + "L0_d": 709.78, + "MLM loss": 1.97847318649292, + "epoch": 7.06, + "step": 331499 + }, + { + "epoch": 7.06, + "learning_rate": 3.4416326530612245e-05, + "loss": 1.9289, + "step": 331500 + }, + { + "FLOPS loss": 0.06118454411625862, + "L0_d": 820.77, + "MLM loss": 1.9433472156524658, + "epoch": 7.07, + "step": 331999 + }, + { + "epoch": 7.07, + "learning_rate": 3.431428571428572e-05, + "loss": 1.9319, + "step": 332000 + }, + { + "FLOPS loss": 0.07941633462905884, + "L0_d": 688.64, + "MLM loss": 1.976243495941162, + "epoch": 7.08, + "step": 332499 + }, + { + "epoch": 7.08, + "learning_rate": 3.421224489795918e-05, + "loss": 1.9272, + "step": 332500 + }, + { + "FLOPS loss": 0.07291039079427719, + "L0_d": 806.14, + "MLM loss": 1.8062152862548828, + "epoch": 7.09, + "step": 332999 + }, + { + "epoch": 7.09, + "learning_rate": 3.411020408163266e-05, + "loss": 1.9243, + "step": 333000 + }, + { + "FLOPS loss": 0.07909004390239716, + "L0_d": 696.17, + "MLM loss": 1.8287863731384277, + "epoch": 7.1, + "step": 333499 + }, + { + "epoch": 7.1, + "learning_rate": 3.400836734693878e-05, + "loss": 1.9272, + "step": 333500 + }, + { + "FLOPS loss": 0.0881006196141243, + "L0_d": 717.56, + "MLM loss": 1.8593976497650146, + "epoch": 7.11, + "step": 333999 + }, + { + "epoch": 7.11, + "learning_rate": 3.3906326530612245e-05, + "loss": 1.9301, + "step": 334000 + }, + { + "FLOPS loss": 0.07782454788684845, + "L0_d": 1056.52, + "MLM loss": 1.8584022521972656, + "epoch": 7.12, + "step": 334499 + }, + { + "epoch": 7.12, + "learning_rate": 3.380428571428572e-05, + "loss": 1.9287, + "step": 334500 + }, + { + "FLOPS loss": 0.09458693116903305, + "L0_d": 1192.09, + "MLM loss": 1.8450138568878174, + "epoch": 7.13, + "step": 334999 + }, + { + "epoch": 7.13, + "learning_rate": 3.370224489795918e-05, + "loss": 1.9261, + "step": 335000 + }, + { + "FLOPS loss": 0.0697149708867073, + "L0_d": 766.3, + "MLM loss": 1.952140212059021, + "epoch": 7.14, + "step": 335499 + }, + { + "epoch": 7.14, + "learning_rate": 3.360020408163265e-05, + "loss": 1.9257, + "step": 335500 + }, + { + "FLOPS loss": 0.06831005960702896, + "L0_d": 805.2, + "MLM loss": 1.7946230173110962, + "epoch": 7.16, + "step": 335999 + }, + { + "epoch": 7.16, + "learning_rate": 3.3498367346938773e-05, + "loss": 1.9256, + "step": 336000 + }, + { + "FLOPS loss": 0.09496098756790161, + "L0_d": 825.23, + "MLM loss": 1.841353416442871, + "epoch": 7.17, + "step": 336499 + }, + { + "epoch": 7.17, + "learning_rate": 3.339632653061225e-05, + "loss": 1.9287, + "step": 336500 + }, + { + "FLOPS loss": 0.07120657712221146, + "L0_d": 649.5, + "MLM loss": 1.8039329051971436, + "epoch": 7.18, + "step": 336999 + }, + { + "epoch": 7.18, + "learning_rate": 3.3294285714285716e-05, + "loss": 1.9261, + "step": 337000 + }, + { + "FLOPS loss": 0.06209271773695946, + "L0_d": 679.39, + "MLM loss": 1.8043580055236816, + "epoch": 7.19, + "step": 337499 + }, + { + "epoch": 7.19, + "learning_rate": 3.319224489795919e-05, + "loss": 1.9278, + "step": 337500 + }, + { + "FLOPS loss": 0.07758375257253647, + "L0_d": 891.48, + "MLM loss": 1.850175142288208, + "epoch": 7.2, + "step": 337999 + }, + { + "epoch": 7.2, + "learning_rate": 3.309020408163265e-05, + "loss": 1.9252, + "step": 338000 + }, + { + "FLOPS loss": 0.07936911284923553, + "L0_d": 770.59, + "MLM loss": 1.8158717155456543, + "epoch": 7.21, + "step": 338499 + }, + { + "epoch": 7.21, + "learning_rate": 3.2988163265306124e-05, + "loss": 1.926, + "step": 338500 + }, + { + "FLOPS loss": 0.06472300738096237, + "L0_d": 691.17, + "MLM loss": 1.9313558340072632, + "epoch": 7.22, + "step": 338999 + }, + { + "epoch": 7.22, + "learning_rate": 3.288632653061225e-05, + "loss": 1.9242, + "step": 339000 + }, + { + "FLOPS loss": 0.08942249417304993, + "L0_d": 774.66, + "MLM loss": 1.9662246704101562, + "epoch": 7.23, + "step": 339499 + }, + { + "epoch": 7.23, + "learning_rate": 3.2784285714285716e-05, + "loss": 1.9267, + "step": 339500 + }, + { + "FLOPS loss": 0.0655161663889885, + "L0_d": 474.06, + "MLM loss": 1.895153522491455, + "epoch": 7.24, + "step": 339999 + }, + { + "epoch": 7.24, + "learning_rate": 3.268224489795919e-05, + "loss": 1.9222, + "step": 340000 + }, + { + "FLOPS loss": 0.0758613795042038, + "L0_d": 719.14, + "MLM loss": 1.8445680141448975, + "epoch": 7.25, + "step": 340499 + }, + { + "epoch": 7.25, + "learning_rate": 3.258020408163265e-05, + "loss": 1.9217, + "step": 340500 + }, + { + "FLOPS loss": 0.0741974487900734, + "L0_d": 599.89, + "MLM loss": 1.9803168773651123, + "epoch": 7.26, + "step": 340999 + }, + { + "epoch": 7.26, + "learning_rate": 3.2478163265306124e-05, + "loss": 1.9259, + "step": 341000 + }, + { + "FLOPS loss": 0.072549968957901, + "L0_d": 668.38, + "MLM loss": 1.759738802909851, + "epoch": 7.27, + "step": 341499 + }, + { + "epoch": 7.27, + "learning_rate": 3.2376326530612245e-05, + "loss": 1.9204, + "step": 341500 + }, + { + "FLOPS loss": 0.07621914148330688, + "L0_d": 1228.61, + "MLM loss": 1.8269237279891968, + "epoch": 7.28, + "step": 341999 + }, + { + "epoch": 7.28, + "learning_rate": 3.2274285714285716e-05, + "loss": 1.9188, + "step": 342000 + }, + { + "FLOPS loss": 0.07951581478118896, + "L0_d": 692.34, + "MLM loss": 1.728185772895813, + "epoch": 7.29, + "step": 342499 + }, + { + "epoch": 7.29, + "learning_rate": 3.217224489795919e-05, + "loss": 1.9277, + "step": 342500 + }, + { + "FLOPS loss": 0.06861908733844757, + "L0_d": 1225.88, + "MLM loss": 1.8209030628204346, + "epoch": 7.3, + "step": 342999 + }, + { + "epoch": 7.3, + "learning_rate": 3.207020408163265e-05, + "loss": 1.9211, + "step": 343000 + }, + { + "FLOPS loss": 0.08117002993822098, + "L0_d": 726.34, + "MLM loss": 1.8451335430145264, + "epoch": 7.32, + "step": 343499 + }, + { + "epoch": 7.32, + "learning_rate": 3.196836734693878e-05, + "loss": 1.9229, + "step": 343500 + }, + { + "FLOPS loss": 0.0719624012708664, + "L0_d": 743.8, + "MLM loss": 1.6969846487045288, + "epoch": 7.33, + "step": 343999 + }, + { + "epoch": 7.33, + "learning_rate": 3.1866326530612244e-05, + "loss": 1.9244, + "step": 344000 + }, + { + "FLOPS loss": 0.07608190178871155, + "L0_d": 922.83, + "MLM loss": 1.9612715244293213, + "epoch": 7.34, + "step": 344499 + }, + { + "epoch": 7.34, + "learning_rate": 3.1764285714285716e-05, + "loss": 1.923, + "step": 344500 + }, + { + "FLOPS loss": 0.07321670651435852, + "L0_d": 853.64, + "MLM loss": 1.928910732269287, + "epoch": 7.35, + "step": 344999 + }, + { + "epoch": 7.35, + "learning_rate": 3.166224489795919e-05, + "loss": 1.9226, + "step": 345000 + }, + { + "FLOPS loss": 0.07549431920051575, + "L0_d": 863.84, + "MLM loss": 1.8605866432189941, + "epoch": 7.36, + "step": 345499 + }, + { + "epoch": 7.36, + "learning_rate": 3.156020408163265e-05, + "loss": 1.9229, + "step": 345500 + }, + { + "FLOPS loss": 0.0844147577881813, + "L0_d": 718.67, + "MLM loss": 1.7773514986038208, + "epoch": 7.37, + "step": 345999 + }, + { + "epoch": 7.37, + "learning_rate": 3.1458163265306123e-05, + "loss": 1.9191, + "step": 346000 + }, + { + "FLOPS loss": 0.08949605375528336, + "L0_d": 1317.33, + "MLM loss": 1.8800100088119507, + "epoch": 7.38, + "step": 346499 + }, + { + "epoch": 7.38, + "learning_rate": 3.1356326530612244e-05, + "loss": 1.9245, + "step": 346500 + }, + { + "FLOPS loss": 0.060002170503139496, + "L0_d": 679.66, + "MLM loss": 1.8119897842407227, + "epoch": 7.39, + "step": 346999 + }, + { + "epoch": 7.39, + "learning_rate": 3.1254285714285716e-05, + "loss": 1.9225, + "step": 347000 + }, + { + "FLOPS loss": 0.0773637667298317, + "L0_d": 792.98, + "MLM loss": 1.8959424495697021, + "epoch": 7.4, + "step": 347499 + }, + { + "epoch": 7.4, + "learning_rate": 3.115224489795919e-05, + "loss": 1.9232, + "step": 347500 + }, + { + "FLOPS loss": 0.06032504886388779, + "L0_d": 722.84, + "MLM loss": 1.9673316478729248, + "epoch": 7.41, + "step": 347999 + }, + { + "epoch": 7.41, + "learning_rate": 3.105020408163266e-05, + "loss": 1.922, + "step": 348000 + }, + { + "FLOPS loss": 0.071210578083992, + "L0_d": 612.83, + "MLM loss": 1.831152319908142, + "epoch": 7.42, + "step": 348499 + }, + { + "epoch": 7.42, + "learning_rate": 3.094836734693878e-05, + "loss": 1.9212, + "step": 348500 + }, + { + "FLOPS loss": 0.07164246588945389, + "L0_d": 795.75, + "MLM loss": 1.7671644687652588, + "epoch": 7.43, + "step": 348999 + }, + { + "epoch": 7.43, + "learning_rate": 3.0846326530612244e-05, + "loss": 1.923, + "step": 349000 + }, + { + "FLOPS loss": 0.06108471378684044, + "L0_d": 598.73, + "MLM loss": 1.7506718635559082, + "epoch": 7.44, + "step": 349499 + }, + { + "epoch": 7.44, + "learning_rate": 3.0744285714285715e-05, + "loss": 1.9199, + "step": 349500 + }, + { + "FLOPS loss": 0.06783506274223328, + "L0_d": 765.02, + "MLM loss": 1.7566602230072021, + "epoch": 7.45, + "step": 349999 + }, + { + "epoch": 7.45, + "learning_rate": 3.064224489795918e-05, + "loss": 1.923, + "step": 350000 + }, + { + "FLOPS loss": 0.06063126027584076, + "L0_d": 623.61, + "MLM loss": 1.923569679260254, + "epoch": 7.46, + "step": 350499 + }, + { + "epoch": 7.46, + "learning_rate": 3.054040816326531e-05, + "loss": 1.923, + "step": 350500 + }, + { + "FLOPS loss": 0.08617284893989563, + "L0_d": 786.66, + "MLM loss": 1.870776653289795, + "epoch": 7.47, + "step": 350999 + }, + { + "epoch": 7.47, + "learning_rate": 3.043836734693878e-05, + "loss": 1.9196, + "step": 351000 + }, + { + "FLOPS loss": 0.0912565290927887, + "L0_d": 1055.14, + "MLM loss": 1.824958086013794, + "epoch": 7.49, + "step": 351499 + }, + { + "epoch": 7.49, + "learning_rate": 3.0336326530612247e-05, + "loss": 1.9217, + "step": 351500 + }, + { + "FLOPS loss": 0.07441375404596329, + "L0_d": 757.95, + "MLM loss": 1.9523437023162842, + "epoch": 7.5, + "step": 351999 + }, + { + "epoch": 7.5, + "learning_rate": 3.0234285714285715e-05, + "loss": 1.9204, + "step": 352000 + }, + { + "FLOPS loss": 0.06910394132137299, + "L0_d": 647.09, + "MLM loss": 1.7434977293014526, + "epoch": 7.51, + "step": 352499 + }, + { + "epoch": 7.51, + "learning_rate": 3.0132448979591836e-05, + "loss": 1.9194, + "step": 352500 + }, + { + "FLOPS loss": 0.07549700140953064, + "L0_d": 835.09, + "MLM loss": 1.9095675945281982, + "epoch": 7.52, + "step": 352999 + }, + { + "epoch": 7.52, + "learning_rate": 3.0030408163265304e-05, + "loss": 1.9195, + "step": 353000 + }, + { + "FLOPS loss": 0.06734315305948257, + "L0_d": 722.47, + "MLM loss": 1.8535124063491821, + "epoch": 7.53, + "step": 353499 + }, + { + "epoch": 7.53, + "learning_rate": 2.992836734693878e-05, + "loss": 1.9224, + "step": 353500 + }, + { + "FLOPS loss": 0.0765361413359642, + "L0_d": 696.53, + "MLM loss": 1.7221925258636475, + "epoch": 7.54, + "step": 353999 + }, + { + "epoch": 7.54, + "learning_rate": 2.9826326530612247e-05, + "loss": 1.9176, + "step": 354000 + }, + { + "FLOPS loss": 0.06397148221731186, + "L0_d": 667.94, + "MLM loss": 1.8591244220733643, + "epoch": 7.55, + "step": 354499 + }, + { + "epoch": 7.55, + "learning_rate": 2.9724285714285715e-05, + "loss": 1.9174, + "step": 354500 + }, + { + "FLOPS loss": 0.08621693402528763, + "L0_d": 1023.33, + "MLM loss": 1.831404209136963, + "epoch": 7.56, + "step": 354999 + }, + { + "epoch": 7.56, + "learning_rate": 2.962244897959184e-05, + "loss": 1.9169, + "step": 355000 + }, + { + "FLOPS loss": 0.06905224174261093, + "L0_d": 788.47, + "MLM loss": 1.9023628234863281, + "epoch": 7.57, + "step": 355499 + }, + { + "epoch": 7.57, + "learning_rate": 2.9520408163265307e-05, + "loss": 1.9171, + "step": 355500 + }, + { + "FLOPS loss": 0.07182815670967102, + "L0_d": 605.72, + "MLM loss": 1.8447587490081787, + "epoch": 7.58, + "step": 355999 + }, + { + "epoch": 7.58, + "learning_rate": 2.9418367346938775e-05, + "loss": 1.9181, + "step": 356000 + }, + { + "FLOPS loss": 0.06056385487318039, + "L0_d": 805.12, + "MLM loss": 1.8549630641937256, + "epoch": 7.59, + "step": 356499 + }, + { + "epoch": 7.59, + "learning_rate": 2.9316326530612247e-05, + "loss": 1.9198, + "step": 356500 + }, + { + "FLOPS loss": 0.07179979234933853, + "L0_d": 687.52, + "MLM loss": 1.7600245475769043, + "epoch": 7.6, + "step": 356999 + }, + { + "epoch": 7.6, + "learning_rate": 2.921448979591837e-05, + "loss": 1.9159, + "step": 357000 + }, + { + "FLOPS loss": 0.06811942905187607, + "L0_d": 618.5, + "MLM loss": 1.9086711406707764, + "epoch": 7.61, + "step": 357499 + }, + { + "epoch": 7.61, + "learning_rate": 2.911244897959184e-05, + "loss": 1.919, + "step": 357500 + }, + { + "FLOPS loss": 0.09158661961555481, + "L0_d": 780.5, + "MLM loss": 1.7335734367370605, + "epoch": 7.62, + "step": 357999 + }, + { + "epoch": 7.62, + "learning_rate": 2.9010408163265307e-05, + "loss": 1.9148, + "step": 358000 + }, + { + "FLOPS loss": 0.0640568807721138, + "L0_d": 632.23, + "MLM loss": 1.8534142971038818, + "epoch": 7.63, + "step": 358499 + }, + { + "epoch": 7.63, + "learning_rate": 2.8908367346938775e-05, + "loss": 1.9175, + "step": 358500 + }, + { + "FLOPS loss": 0.06531016528606415, + "L0_d": 839.8, + "MLM loss": 1.8377444744110107, + "epoch": 7.65, + "step": 358999 + }, + { + "epoch": 7.65, + "learning_rate": 2.880632653061225e-05, + "loss": 1.9144, + "step": 359000 + }, + { + "FLOPS loss": 0.08689986169338226, + "L0_d": 864.11, + "MLM loss": 1.752164363861084, + "epoch": 7.66, + "step": 359499 + }, + { + "epoch": 7.66, + "learning_rate": 2.870448979591837e-05, + "loss": 1.9173, + "step": 359500 + }, + { + "FLOPS loss": 0.08605177700519562, + "L0_d": 841.92, + "MLM loss": 2.0353498458862305, + "epoch": 7.67, + "step": 359999 + }, + { + "epoch": 7.67, + "learning_rate": 2.860244897959184e-05, + "loss": 1.915, + "step": 360000 + }, + { + "FLOPS loss": 0.08223269879817963, + "L0_d": 663.62, + "MLM loss": 1.9616049528121948, + "epoch": 7.68, + "step": 360499 + }, + { + "epoch": 7.68, + "learning_rate": 2.8500408163265307e-05, + "loss": 1.9187, + "step": 360500 + }, + { + "FLOPS loss": 0.06230027228593826, + "L0_d": 623.67, + "MLM loss": 1.8606956005096436, + "epoch": 7.69, + "step": 360999 + }, + { + "epoch": 7.69, + "learning_rate": 2.8398367346938775e-05, + "loss": 1.9192, + "step": 361000 + }, + { + "FLOPS loss": 0.08796975761651993, + "L0_d": 961.97, + "MLM loss": 1.7985609769821167, + "epoch": 7.7, + "step": 361499 + }, + { + "epoch": 7.7, + "learning_rate": 2.82965306122449e-05, + "loss": 1.9193, + "step": 361500 + }, + { + "FLOPS loss": 0.07598540186882019, + "L0_d": 680.53, + "MLM loss": 1.9214376211166382, + "epoch": 7.71, + "step": 361999 + }, + { + "epoch": 7.71, + "learning_rate": 2.8194489795918367e-05, + "loss": 1.9163, + "step": 362000 + }, + { + "FLOPS loss": 0.07575920224189758, + "L0_d": 729.17, + "MLM loss": 1.9614065885543823, + "epoch": 7.72, + "step": 362499 + }, + { + "epoch": 7.72, + "learning_rate": 2.809244897959184e-05, + "loss": 1.9169, + "step": 362500 + }, + { + "FLOPS loss": 0.07568804174661636, + "L0_d": 612.11, + "MLM loss": 1.794695258140564, + "epoch": 7.73, + "step": 362999 + }, + { + "epoch": 7.73, + "learning_rate": 2.7990408163265307e-05, + "loss": 1.9132, + "step": 363000 + }, + { + "FLOPS loss": 0.08963464200496674, + "L0_d": 716.73, + "MLM loss": 1.7673532962799072, + "epoch": 7.74, + "step": 363499 + }, + { + "epoch": 7.74, + "learning_rate": 2.788857142857143e-05, + "loss": 1.914, + "step": 363500 + }, + { + "FLOPS loss": 0.07700413465499878, + "L0_d": 543.23, + "MLM loss": 1.705705165863037, + "epoch": 7.75, + "step": 363999 + }, + { + "epoch": 7.75, + "learning_rate": 2.77865306122449e-05, + "loss": 1.9145, + "step": 364000 + }, + { + "FLOPS loss": 0.07794135063886642, + "L0_d": 697.8, + "MLM loss": 1.9613754749298096, + "epoch": 7.76, + "step": 364499 + }, + { + "epoch": 7.76, + "learning_rate": 2.7684489795918367e-05, + "loss": 1.9186, + "step": 364500 + }, + { + "FLOPS loss": 0.07350470125675201, + "L0_d": 785.53, + "MLM loss": 1.8970494270324707, + "epoch": 7.77, + "step": 364999 + }, + { + "epoch": 7.77, + "learning_rate": 2.7582448979591842e-05, + "loss": 1.9149, + "step": 365000 + }, + { + "FLOPS loss": 0.07593075931072235, + "L0_d": 1040.44, + "MLM loss": 1.8233039379119873, + "epoch": 7.78, + "step": 365499 + }, + { + "epoch": 7.78, + "learning_rate": 2.7480612244897962e-05, + "loss": 1.9167, + "step": 365500 + }, + { + "FLOPS loss": 0.07376305758953094, + "L0_d": 756.62, + "MLM loss": 1.7765004634857178, + "epoch": 7.79, + "step": 365999 + }, + { + "epoch": 7.79, + "learning_rate": 2.737857142857143e-05, + "loss": 1.9157, + "step": 366000 + }, + { + "FLOPS loss": 0.07158848643302917, + "L0_d": 556.3, + "MLM loss": 1.8171870708465576, + "epoch": 7.8, + "step": 366499 + }, + { + "epoch": 7.8, + "learning_rate": 2.72765306122449e-05, + "loss": 1.9129, + "step": 366500 + }, + { + "FLOPS loss": 0.06732846796512604, + "L0_d": 818.38, + "MLM loss": 1.8742527961730957, + "epoch": 7.82, + "step": 366999 + }, + { + "epoch": 7.82, + "learning_rate": 2.7174489795918367e-05, + "loss": 1.9129, + "step": 367000 + }, + { + "FLOPS loss": 0.06487272679805756, + "L0_d": 669.91, + "MLM loss": 1.7070577144622803, + "epoch": 7.83, + "step": 367499 + }, + { + "epoch": 7.83, + "learning_rate": 2.7072448979591835e-05, + "loss": 1.9126, + "step": 367500 + }, + { + "FLOPS loss": 0.08324155956506729, + "L0_d": 732.16, + "MLM loss": 1.7398138046264648, + "epoch": 7.84, + "step": 367999 + }, + { + "epoch": 7.84, + "learning_rate": 2.697061224489796e-05, + "loss": 1.9131, + "step": 368000 + }, + { + "FLOPS loss": 0.0730772539973259, + "L0_d": 628.33, + "MLM loss": 1.8350969552993774, + "epoch": 7.85, + "step": 368499 + }, + { + "epoch": 7.85, + "learning_rate": 2.686857142857143e-05, + "loss": 1.913, + "step": 368500 + }, + { + "FLOPS loss": 0.08549657464027405, + "L0_d": 948.14, + "MLM loss": 1.8315303325653076, + "epoch": 7.86, + "step": 368999 + }, + { + "epoch": 7.86, + "learning_rate": 2.67665306122449e-05, + "loss": 1.9119, + "step": 369000 + }, + { + "FLOPS loss": 0.08233258128166199, + "L0_d": 708.77, + "MLM loss": 1.9612047672271729, + "epoch": 7.87, + "step": 369499 + }, + { + "epoch": 7.87, + "learning_rate": 2.666448979591837e-05, + "loss": 1.9116, + "step": 369500 + }, + { + "FLOPS loss": 0.05898122489452362, + "L0_d": 617.86, + "MLM loss": 1.8039360046386719, + "epoch": 7.88, + "step": 369999 + }, + { + "epoch": 7.88, + "learning_rate": 2.656265306122449e-05, + "loss": 1.9108, + "step": 370000 + }, + { + "FLOPS loss": 0.06786539405584335, + "L0_d": 863.75, + "MLM loss": 1.921259880065918, + "epoch": 7.89, + "step": 370499 + }, + { + "epoch": 7.89, + "learning_rate": 2.646061224489796e-05, + "loss": 1.9133, + "step": 370500 + }, + { + "FLOPS loss": 0.09662061184644699, + "L0_d": 944.55, + "MLM loss": 1.8544368743896484, + "epoch": 7.9, + "step": 370999 + }, + { + "epoch": 7.9, + "learning_rate": 2.6358571428571433e-05, + "loss": 1.9118, + "step": 371000 + }, + { + "FLOPS loss": 0.07581643760204315, + "L0_d": 655.27, + "MLM loss": 1.8517892360687256, + "epoch": 7.91, + "step": 371499 + }, + { + "epoch": 7.91, + "learning_rate": 2.62565306122449e-05, + "loss": 1.9103, + "step": 371500 + }, + { + "FLOPS loss": 0.062037497758865356, + "L0_d": 596.05, + "MLM loss": 1.9065392017364502, + "epoch": 7.92, + "step": 371999 + }, + { + "epoch": 7.92, + "learning_rate": 2.6154693877551022e-05, + "loss": 1.9132, + "step": 372000 + }, + { + "FLOPS loss": 0.06359484791755676, + "L0_d": 685.69, + "MLM loss": 1.793999433517456, + "epoch": 7.93, + "step": 372499 + }, + { + "epoch": 7.93, + "learning_rate": 2.605265306122449e-05, + "loss": 1.9114, + "step": 372500 + }, + { + "FLOPS loss": 0.06487156450748444, + "L0_d": 1092.2, + "MLM loss": 1.8625104427337646, + "epoch": 7.94, + "step": 372999 + }, + { + "epoch": 7.94, + "learning_rate": 2.595061224489796e-05, + "loss": 1.9142, + "step": 373000 + }, + { + "FLOPS loss": 0.09129762649536133, + "L0_d": 820.39, + "MLM loss": 1.8764865398406982, + "epoch": 7.95, + "step": 373499 + }, + { + "epoch": 7.95, + "learning_rate": 2.5848571428571426e-05, + "loss": 1.9147, + "step": 373500 + }, + { + "FLOPS loss": 0.08218321949243546, + "L0_d": 650.84, + "MLM loss": 1.7837188243865967, + "epoch": 7.96, + "step": 373999 + }, + { + "epoch": 7.96, + "learning_rate": 2.57465306122449e-05, + "loss": 1.9108, + "step": 374000 + }, + { + "FLOPS loss": 0.08408604562282562, + "L0_d": 712.58, + "MLM loss": 1.7383946180343628, + "epoch": 7.98, + "step": 374499 + }, + { + "epoch": 7.98, + "learning_rate": 2.564448979591837e-05, + "loss": 1.9127, + "step": 374500 + }, + { + "FLOPS loss": 0.07430750876665115, + "L0_d": 710.8, + "MLM loss": 1.8748829364776611, + "epoch": 7.99, + "step": 374999 + }, + { + "epoch": 7.99, + "learning_rate": 2.5542653061224494e-05, + "loss": 1.9116, + "step": 375000 + }, + { + "FLOPS loss": 0.06837387382984161, + "L0_d": 639.45, + "MLM loss": 1.8347387313842773, + "epoch": 8.0, + "step": 375499 + }, + { + "epoch": 8.0, + "learning_rate": 2.544061224489796e-05, + "loss": 1.9107, + "step": 375500 + }, + { + "FLOPS loss": 0.07094568014144897, + "L0_d": 625.97, + "MLM loss": 1.9303233623504639, + "epoch": 8.01, + "step": 375999 + }, + { + "epoch": 8.01, + "learning_rate": 2.533857142857143e-05, + "loss": 1.9096, + "step": 376000 + }, + { + "FLOPS loss": 0.07427439838647842, + "L0_d": 590.55, + "MLM loss": 1.683091640472412, + "epoch": 8.02, + "step": 376499 + }, + { + "epoch": 8.02, + "learning_rate": 2.52365306122449e-05, + "loss": 1.9121, + "step": 376500 + }, + { + "FLOPS loss": 0.09582684934139252, + "L0_d": 720.77, + "MLM loss": 1.9093230962753296, + "epoch": 8.03, + "step": 376999 + }, + { + "epoch": 8.03, + "learning_rate": 2.513448979591837e-05, + "loss": 1.9072, + "step": 377000 + }, + { + "FLOPS loss": 0.06721869856119156, + "L0_d": 743.12, + "MLM loss": 1.7846167087554932, + "epoch": 8.04, + "step": 377499 + }, + { + "epoch": 8.04, + "learning_rate": 2.5032653061224493e-05, + "loss": 1.908, + "step": 377500 + }, + { + "FLOPS loss": 0.08038800954818726, + "L0_d": 802.34, + "MLM loss": 1.8537399768829346, + "epoch": 8.05, + "step": 377999 + }, + { + "epoch": 8.05, + "learning_rate": 2.493061224489796e-05, + "loss": 1.9074, + "step": 378000 + }, + { + "FLOPS loss": 0.06283194571733475, + "L0_d": 806.69, + "MLM loss": 1.9123787879943848, + "epoch": 8.06, + "step": 378499 + }, + { + "epoch": 8.06, + "learning_rate": 2.482857142857143e-05, + "loss": 1.9098, + "step": 378500 + }, + { + "FLOPS loss": 0.07149063050746918, + "L0_d": 930.73, + "MLM loss": 2.009641408920288, + "epoch": 8.07, + "step": 378999 + }, + { + "epoch": 8.07, + "learning_rate": 2.47265306122449e-05, + "loss": 1.9081, + "step": 379000 + }, + { + "FLOPS loss": 0.06387903541326523, + "L0_d": 712.73, + "MLM loss": 1.7018380165100098, + "epoch": 8.08, + "step": 379499 + }, + { + "epoch": 8.08, + "learning_rate": 2.462469387755102e-05, + "loss": 1.9062, + "step": 379500 + }, + { + "FLOPS loss": 0.07200448960065842, + "L0_d": 905.91, + "MLM loss": 1.6971787214279175, + "epoch": 8.09, + "step": 379999 + }, + { + "epoch": 8.09, + "learning_rate": 2.452265306122449e-05, + "loss": 1.9014, + "step": 380000 + }, + { + "FLOPS loss": 0.09073708951473236, + "L0_d": 840.09, + "MLM loss": 1.6935443878173828, + "epoch": 8.1, + "step": 380499 + }, + { + "epoch": 8.1, + "learning_rate": 2.442061224489796e-05, + "loss": 1.9123, + "step": 380500 + }, + { + "FLOPS loss": 0.06841748207807541, + "L0_d": 746.36, + "MLM loss": 1.880540132522583, + "epoch": 8.11, + "step": 380999 + }, + { + "epoch": 8.11, + "learning_rate": 2.431857142857143e-05, + "loss": 1.8999, + "step": 381000 + }, + { + "FLOPS loss": 0.055594101548194885, + "L0_d": 584.33, + "MLM loss": 1.7323696613311768, + "epoch": 8.12, + "step": 381499 + }, + { + "epoch": 8.12, + "learning_rate": 2.4216734693877553e-05, + "loss": 1.9055, + "step": 381500 + }, + { + "FLOPS loss": 0.06862866133451462, + "L0_d": 639.55, + "MLM loss": 1.7428958415985107, + "epoch": 8.13, + "step": 381999 + }, + { + "epoch": 8.13, + "learning_rate": 2.411469387755102e-05, + "loss": 1.9064, + "step": 382000 + }, + { + "FLOPS loss": 0.07999974489212036, + "L0_d": 834.3, + "MLM loss": 1.8679862022399902, + "epoch": 8.15, + "step": 382499 + }, + { + "epoch": 8.15, + "learning_rate": 2.401265306122449e-05, + "loss": 1.9049, + "step": 382500 + }, + { + "FLOPS loss": 0.06594076007604599, + "L0_d": 671.27, + "MLM loss": 1.7892141342163086, + "epoch": 8.16, + "step": 382999 + }, + { + "epoch": 8.16, + "learning_rate": 2.3910612244897958e-05, + "loss": 1.9086, + "step": 383000 + }, + { + "FLOPS loss": 0.08169030398130417, + "L0_d": 679.23, + "MLM loss": 1.7941107749938965, + "epoch": 8.17, + "step": 383499 + }, + { + "epoch": 8.17, + "learning_rate": 2.380877551020408e-05, + "loss": 1.9062, + "step": 383500 + }, + { + "FLOPS loss": 0.08979839831590652, + "L0_d": 802.62, + "MLM loss": 1.776297688484192, + "epoch": 8.18, + "step": 383999 + }, + { + "epoch": 8.18, + "learning_rate": 2.3706734693877553e-05, + "loss": 1.9051, + "step": 384000 + }, + { + "FLOPS loss": 0.08627878129482269, + "L0_d": 852.94, + "MLM loss": 2.0511772632598877, + "epoch": 8.19, + "step": 384499 + }, + { + "epoch": 8.19, + "learning_rate": 2.360469387755102e-05, + "loss": 1.9053, + "step": 384500 + }, + { + "FLOPS loss": 0.0771448090672493, + "L0_d": 644.89, + "MLM loss": 1.7911652326583862, + "epoch": 8.2, + "step": 384999 + }, + { + "epoch": 8.2, + "learning_rate": 2.3502653061224493e-05, + "loss": 1.9049, + "step": 385000 + }, + { + "FLOPS loss": 0.09200733155012131, + "L0_d": 624.42, + "MLM loss": 1.8275014162063599, + "epoch": 8.21, + "step": 385499 + }, + { + "epoch": 8.21, + "learning_rate": 2.340061224489796e-05, + "loss": 1.9085, + "step": 385500 + }, + { + "FLOPS loss": 0.0631524920463562, + "L0_d": 915.19, + "MLM loss": 1.7387886047363281, + "epoch": 8.22, + "step": 385999 + }, + { + "epoch": 8.22, + "learning_rate": 2.329877551020408e-05, + "loss": 1.9045, + "step": 386000 + }, + { + "FLOPS loss": 0.07859036326408386, + "L0_d": 814.27, + "MLM loss": 1.9064040184020996, + "epoch": 8.23, + "step": 386499 + }, + { + "epoch": 8.23, + "learning_rate": 2.3196734693877553e-05, + "loss": 1.9028, + "step": 386500 + }, + { + "FLOPS loss": 0.08318474888801575, + "L0_d": 662.44, + "MLM loss": 1.8862558603286743, + "epoch": 8.24, + "step": 386999 + }, + { + "epoch": 8.24, + "learning_rate": 2.309469387755102e-05, + "loss": 1.9052, + "step": 387000 + }, + { + "FLOPS loss": 0.07222796231508255, + "L0_d": 875.73, + "MLM loss": 1.7195855379104614, + "epoch": 8.25, + "step": 387499 + }, + { + "epoch": 8.25, + "learning_rate": 2.2992653061224493e-05, + "loss": 1.9057, + "step": 387500 + }, + { + "FLOPS loss": 0.06394381076097488, + "L0_d": 656.22, + "MLM loss": 1.7930631637573242, + "epoch": 8.26, + "step": 387999 + }, + { + "epoch": 8.26, + "learning_rate": 2.2890816326530613e-05, + "loss": 1.9036, + "step": 388000 + }, + { + "FLOPS loss": 0.08451195806264877, + "L0_d": 1135.05, + "MLM loss": 1.8217693567276, + "epoch": 8.27, + "step": 388499 + }, + { + "epoch": 8.27, + "learning_rate": 2.278877551020408e-05, + "loss": 1.9043, + "step": 388500 + }, + { + "FLOPS loss": 0.0837959423661232, + "L0_d": 694.78, + "MLM loss": 1.817704677581787, + "epoch": 8.28, + "step": 388999 + }, + { + "epoch": 8.28, + "learning_rate": 2.268673469387755e-05, + "loss": 1.9049, + "step": 389000 + }, + { + "FLOPS loss": 0.08096183836460114, + "L0_d": 749.94, + "MLM loss": 1.900554895401001, + "epoch": 8.29, + "step": 389499 + }, + { + "epoch": 8.29, + "learning_rate": 2.258469387755102e-05, + "loss": 1.9013, + "step": 389500 + }, + { + "FLOPS loss": 0.07682886719703674, + "L0_d": 733.36, + "MLM loss": 1.9024741649627686, + "epoch": 8.31, + "step": 389999 + }, + { + "epoch": 8.31, + "learning_rate": 2.2482857142857145e-05, + "loss": 1.9057, + "step": 390000 + }, + { + "FLOPS loss": 0.07478464394807816, + "L0_d": 660.47, + "MLM loss": 1.8336386680603027, + "epoch": 8.32, + "step": 390499 + }, + { + "epoch": 8.32, + "learning_rate": 2.2380816326530613e-05, + "loss": 1.9003, + "step": 390500 + }, + { + "FLOPS loss": 0.06300283223390579, + "L0_d": 685.8, + "MLM loss": 1.796729564666748, + "epoch": 8.33, + "step": 390999 + }, + { + "epoch": 8.33, + "learning_rate": 2.2278775510204084e-05, + "loss": 1.9027, + "step": 391000 + }, + { + "FLOPS loss": 0.07821809500455856, + "L0_d": 776.97, + "MLM loss": 1.9271748065948486, + "epoch": 8.34, + "step": 391499 + }, + { + "epoch": 8.34, + "learning_rate": 2.2176734693877553e-05, + "loss": 1.9022, + "step": 391500 + }, + { + "FLOPS loss": 0.07604720443487167, + "L0_d": 684.19, + "MLM loss": 1.953967809677124, + "epoch": 8.35, + "step": 391999 + }, + { + "epoch": 8.35, + "learning_rate": 2.2074693877551024e-05, + "loss": 1.9029, + "step": 392000 + }, + { + "FLOPS loss": 0.0781513974070549, + "L0_d": 887.06, + "MLM loss": 1.827120304107666, + "epoch": 8.36, + "step": 392499 + }, + { + "epoch": 8.36, + "learning_rate": 2.1972857142857145e-05, + "loss": 1.9017, + "step": 392500 + }, + { + "FLOPS loss": 0.06326673179864883, + "L0_d": 775.97, + "MLM loss": 1.8110060691833496, + "epoch": 8.37, + "step": 392999 + }, + { + "epoch": 8.37, + "learning_rate": 2.1870816326530613e-05, + "loss": 1.9043, + "step": 393000 + }, + { + "FLOPS loss": 0.08700016140937805, + "L0_d": 814.06, + "MLM loss": 1.7394301891326904, + "epoch": 8.38, + "step": 393499 + }, + { + "epoch": 8.38, + "learning_rate": 2.176877551020408e-05, + "loss": 1.8996, + "step": 393500 + }, + { + "FLOPS loss": 0.07348722219467163, + "L0_d": 698.36, + "MLM loss": 1.963762640953064, + "epoch": 8.39, + "step": 393999 + }, + { + "epoch": 8.39, + "learning_rate": 2.1666734693877552e-05, + "loss": 1.9034, + "step": 394000 + }, + { + "FLOPS loss": 0.08353521674871445, + "L0_d": 892.64, + "MLM loss": 1.8784441947937012, + "epoch": 8.4, + "step": 394499 + }, + { + "epoch": 8.4, + "learning_rate": 2.156469387755102e-05, + "loss": 1.8994, + "step": 394500 + }, + { + "FLOPS loss": 0.06172090768814087, + "L0_d": 537.95, + "MLM loss": 1.8824307918548584, + "epoch": 8.41, + "step": 394999 + }, + { + "epoch": 8.41, + "learning_rate": 2.1462857142857145e-05, + "loss": 1.9054, + "step": 395000 + }, + { + "FLOPS loss": 0.06748291105031967, + "L0_d": 672.89, + "MLM loss": 1.723147988319397, + "epoch": 8.42, + "step": 395499 + }, + { + "epoch": 8.42, + "learning_rate": 2.1360816326530613e-05, + "loss": 1.904, + "step": 395500 + }, + { + "FLOPS loss": 0.07689233869314194, + "L0_d": 822.42, + "MLM loss": 1.7123825550079346, + "epoch": 8.43, + "step": 395999 + }, + { + "epoch": 8.43, + "learning_rate": 2.125877551020408e-05, + "loss": 1.9018, + "step": 396000 + }, + { + "FLOPS loss": 0.0659564808011055, + "L0_d": 635.97, + "MLM loss": 1.9036650657653809, + "epoch": 8.44, + "step": 396499 + }, + { + "epoch": 8.44, + "learning_rate": 2.1156734693877552e-05, + "loss": 1.8999, + "step": 396500 + }, + { + "FLOPS loss": 0.07588682323694229, + "L0_d": 804.62, + "MLM loss": 1.774322271347046, + "epoch": 8.45, + "step": 396999 + }, + { + "epoch": 8.45, + "learning_rate": 2.105469387755102e-05, + "loss": 1.9002, + "step": 397000 + }, + { + "FLOPS loss": 0.05796373263001442, + "L0_d": 603.67, + "MLM loss": 1.7872540950775146, + "epoch": 8.46, + "step": 397499 + }, + { + "epoch": 8.47, + "learning_rate": 2.0952857142857144e-05, + "loss": 1.899, + "step": 397500 + }, + { + "FLOPS loss": 0.06982676684856415, + "L0_d": 703.95, + "MLM loss": 1.8720976114273071, + "epoch": 8.48, + "step": 397999 + }, + { + "epoch": 8.48, + "learning_rate": 2.0850816326530616e-05, + "loss": 1.902, + "step": 398000 + }, + { + "FLOPS loss": 0.06411126255989075, + "L0_d": 640.84, + "MLM loss": 1.9107850790023804, + "epoch": 8.49, + "step": 398499 + }, + { + "epoch": 8.49, + "learning_rate": 2.0748775510204084e-05, + "loss": 1.8978, + "step": 398500 + }, + { + "FLOPS loss": 0.0808933675289154, + "L0_d": 718.69, + "MLM loss": 1.850345492362976, + "epoch": 8.5, + "step": 398999 + }, + { + "epoch": 8.5, + "learning_rate": 2.0646734693877552e-05, + "loss": 1.8977, + "step": 399000 + }, + { + "FLOPS loss": 0.0982719212770462, + "L0_d": 1116.62, + "MLM loss": 1.8427366018295288, + "epoch": 8.51, + "step": 399499 + }, + { + "epoch": 8.51, + "learning_rate": 2.0544897959183673e-05, + "loss": 1.9048, + "step": 399500 + }, + { + "FLOPS loss": 0.08528366684913635, + "L0_d": 807.28, + "MLM loss": 1.8523929119110107, + "epoch": 8.52, + "step": 399999 + }, + { + "epoch": 8.52, + "learning_rate": 2.0442857142857144e-05, + "loss": 1.8993, + "step": 400000 + }, + { + "FLOPS loss": 0.07065005600452423, + "L0_d": 602.44, + "MLM loss": 1.8898108005523682, + "epoch": 8.53, + "step": 400499 + }, + { + "epoch": 8.53, + "learning_rate": 2.0340816326530612e-05, + "loss": 1.8997, + "step": 400500 + }, + { + "FLOPS loss": 0.07828733325004578, + "L0_d": 999.39, + "MLM loss": 1.8211073875427246, + "epoch": 8.54, + "step": 400999 + }, + { + "epoch": 8.54, + "learning_rate": 2.0238775510204084e-05, + "loss": 1.9008, + "step": 401000 + }, + { + "FLOPS loss": 0.07446083426475525, + "L0_d": 747.81, + "MLM loss": 1.7699792385101318, + "epoch": 8.55, + "step": 401499 + }, + { + "epoch": 8.55, + "learning_rate": 2.0136938775510204e-05, + "loss": 1.8991, + "step": 401500 + }, + { + "FLOPS loss": 0.06935106962919235, + "L0_d": 611.12, + "MLM loss": 1.894187569618225, + "epoch": 8.56, + "step": 401999 + }, + { + "epoch": 8.56, + "learning_rate": 2.0034897959183672e-05, + "loss": 1.901, + "step": 402000 + }, + { + "FLOPS loss": 0.0887903943657875, + "L0_d": 788.98, + "MLM loss": 1.712061882019043, + "epoch": 8.57, + "step": 402499 + }, + { + "epoch": 8.57, + "learning_rate": 1.9932857142857144e-05, + "loss": 1.8946, + "step": 402500 + }, + { + "FLOPS loss": 0.0613437294960022, + "L0_d": 597.14, + "MLM loss": 1.7486085891723633, + "epoch": 8.58, + "step": 402999 + }, + { + "epoch": 8.58, + "learning_rate": 1.9830816326530612e-05, + "loss": 1.901, + "step": 403000 + }, + { + "FLOPS loss": 0.07355440407991409, + "L0_d": 805.77, + "MLM loss": 1.9495327472686768, + "epoch": 8.59, + "step": 403499 + }, + { + "epoch": 8.59, + "learning_rate": 1.9728979591836736e-05, + "loss": 1.8964, + "step": 403500 + }, + { + "FLOPS loss": 0.07202013581991196, + "L0_d": 770.02, + "MLM loss": 2.0217814445495605, + "epoch": 8.6, + "step": 403999 + }, + { + "epoch": 8.6, + "learning_rate": 1.9626938775510208e-05, + "loss": 1.8962, + "step": 404000 + }, + { + "FLOPS loss": 0.07498647272586823, + "L0_d": 800.06, + "MLM loss": 1.692859411239624, + "epoch": 8.61, + "step": 404499 + }, + { + "epoch": 8.61, + "learning_rate": 1.9524897959183676e-05, + "loss": 1.8966, + "step": 404500 + }, + { + "FLOPS loss": 0.06827228516340256, + "L0_d": 777.12, + "MLM loss": 1.7969110012054443, + "epoch": 8.62, + "step": 404999 + }, + { + "epoch": 8.62, + "learning_rate": 1.9422857142857144e-05, + "loss": 1.8969, + "step": 405000 + }, + { + "FLOPS loss": 0.07816651463508606, + "L0_d": 755.45, + "MLM loss": 1.8361034393310547, + "epoch": 8.64, + "step": 405499 + }, + { + "epoch": 8.64, + "learning_rate": 1.9321020408163264e-05, + "loss": 1.9007, + "step": 405500 + }, + { + "FLOPS loss": 0.07840103656053543, + "L0_d": 852.75, + "MLM loss": 1.8891184329986572, + "epoch": 8.65, + "step": 405999 + }, + { + "epoch": 8.65, + "learning_rate": 1.9218979591836736e-05, + "loss": 1.8984, + "step": 406000 + }, + { + "FLOPS loss": 0.06752259284257889, + "L0_d": 679.36, + "MLM loss": 1.6485272645950317, + "epoch": 8.66, + "step": 406499 + }, + { + "epoch": 8.66, + "learning_rate": 1.9116938775510204e-05, + "loss": 1.8959, + "step": 406500 + }, + { + "FLOPS loss": 0.07557836920022964, + "L0_d": 606.3, + "MLM loss": 1.8115220069885254, + "epoch": 8.67, + "step": 406999 + }, + { + "epoch": 8.67, + "learning_rate": 1.9014897959183675e-05, + "loss": 1.8979, + "step": 407000 + }, + { + "FLOPS loss": 0.07810939103364944, + "L0_d": 803.3, + "MLM loss": 1.8370587825775146, + "epoch": 8.68, + "step": 407499 + }, + { + "epoch": 8.68, + "learning_rate": 1.8913061224489796e-05, + "loss": 1.8973, + "step": 407500 + }, + { + "FLOPS loss": 0.07769665867090225, + "L0_d": 854.0, + "MLM loss": 1.7443437576293945, + "epoch": 8.69, + "step": 407999 + }, + { + "epoch": 8.69, + "learning_rate": 1.8811020408163264e-05, + "loss": 1.9003, + "step": 408000 + }, + { + "FLOPS loss": 0.07340855151414871, + "L0_d": 762.5, + "MLM loss": 1.8406531810760498, + "epoch": 8.7, + "step": 408499 + }, + { + "epoch": 8.7, + "learning_rate": 1.8708979591836736e-05, + "loss": 1.8983, + "step": 408500 + }, + { + "FLOPS loss": 0.07528545707464218, + "L0_d": 656.95, + "MLM loss": 1.853143572807312, + "epoch": 8.71, + "step": 408999 + }, + { + "epoch": 8.71, + "learning_rate": 1.8606938775510204e-05, + "loss": 1.8998, + "step": 409000 + }, + { + "FLOPS loss": 0.09381501376628876, + "L0_d": 876.23, + "MLM loss": 1.7811716794967651, + "epoch": 8.72, + "step": 409499 + }, + { + "epoch": 8.72, + "learning_rate": 1.8504897959183672e-05, + "loss": 1.8971, + "step": 409500 + }, + { + "FLOPS loss": 0.09534640610218048, + "L0_d": 803.41, + "MLM loss": 1.7951806783676147, + "epoch": 8.73, + "step": 409999 + }, + { + "epoch": 8.73, + "learning_rate": 1.8403061224489796e-05, + "loss": 1.8951, + "step": 410000 + }, + { + "FLOPS loss": 0.06121053919196129, + "L0_d": 595.77, + "MLM loss": 1.647636890411377, + "epoch": 8.74, + "step": 410499 + }, + { + "epoch": 8.74, + "learning_rate": 1.8301020408163267e-05, + "loss": 1.8974, + "step": 410500 + }, + { + "FLOPS loss": 0.06547104567289352, + "L0_d": 565.81, + "MLM loss": 1.7979687452316284, + "epoch": 8.75, + "step": 410999 + }, + { + "epoch": 8.75, + "learning_rate": 1.8198979591836735e-05, + "loss": 1.8961, + "step": 411000 + }, + { + "FLOPS loss": 0.09215452522039413, + "L0_d": 841.16, + "MLM loss": 1.897557258605957, + "epoch": 8.76, + "step": 411499 + }, + { + "epoch": 8.76, + "learning_rate": 1.8096938775510207e-05, + "loss": 1.8968, + "step": 411500 + }, + { + "FLOPS loss": 0.08165416121482849, + "L0_d": 823.58, + "MLM loss": 1.8826181888580322, + "epoch": 8.77, + "step": 411999 + }, + { + "epoch": 8.77, + "learning_rate": 1.7994897959183675e-05, + "loss": 1.8971, + "step": 412000 + }, + { + "FLOPS loss": 0.07333676517009735, + "L0_d": 659.52, + "MLM loss": 1.7057745456695557, + "epoch": 8.78, + "step": 412499 + }, + { + "epoch": 8.78, + "learning_rate": 1.7893061224489796e-05, + "loss": 1.8954, + "step": 412500 + }, + { + "FLOPS loss": 0.06546351313591003, + "L0_d": 597.67, + "MLM loss": 1.8420034646987915, + "epoch": 8.8, + "step": 412999 + }, + { + "epoch": 8.8, + "learning_rate": 1.7791020408163267e-05, + "loss": 1.8968, + "step": 413000 + }, + { + "FLOPS loss": 0.06565094739198685, + "L0_d": 615.06, + "MLM loss": 1.8575172424316406, + "epoch": 8.81, + "step": 413499 + }, + { + "epoch": 8.81, + "learning_rate": 1.7688979591836735e-05, + "loss": 1.8928, + "step": 413500 + }, + { + "FLOPS loss": 0.06962642818689346, + "L0_d": 819.78, + "MLM loss": 1.8613834381103516, + "epoch": 8.82, + "step": 413999 + }, + { + "epoch": 8.82, + "learning_rate": 1.7586938775510207e-05, + "loss": 1.8949, + "step": 414000 + }, + { + "FLOPS loss": 0.08159743249416351, + "L0_d": 831.28, + "MLM loss": 1.8682200908660889, + "epoch": 8.83, + "step": 414499 + }, + { + "epoch": 8.83, + "learning_rate": 1.7485102040816327e-05, + "loss": 1.8959, + "step": 414500 + }, + { + "FLOPS loss": 0.08759882301092148, + "L0_d": 845.52, + "MLM loss": 1.7896513938903809, + "epoch": 8.84, + "step": 414999 + }, + { + "epoch": 8.84, + "learning_rate": 1.7383061224489796e-05, + "loss": 1.8955, + "step": 415000 + }, + { + "FLOPS loss": 0.0938563421368599, + "L0_d": 1010.78, + "MLM loss": 1.7237412929534912, + "epoch": 8.85, + "step": 415499 + }, + { + "epoch": 8.85, + "learning_rate": 1.7281020408163264e-05, + "loss": 1.8961, + "step": 415500 + }, + { + "FLOPS loss": 0.07220235466957092, + "L0_d": 654.64, + "MLM loss": 1.8532274961471558, + "epoch": 8.86, + "step": 415999 + }, + { + "epoch": 8.86, + "learning_rate": 1.7178979591836735e-05, + "loss": 1.895, + "step": 416000 + }, + { + "FLOPS loss": 0.06810647994279861, + "L0_d": 613.75, + "MLM loss": 2.043182849884033, + "epoch": 8.87, + "step": 416499 + }, + { + "epoch": 8.87, + "learning_rate": 1.707714285714286e-05, + "loss": 1.8943, + "step": 416500 + }, + { + "FLOPS loss": 0.07071580737829208, + "L0_d": 661.27, + "MLM loss": 1.9305920600891113, + "epoch": 8.88, + "step": 416999 + }, + { + "epoch": 8.88, + "learning_rate": 1.6975102040816327e-05, + "loss": 1.8946, + "step": 417000 + }, + { + "FLOPS loss": 0.06806696206331253, + "L0_d": 770.3, + "MLM loss": 1.8732974529266357, + "epoch": 8.89, + "step": 417499 + }, + { + "epoch": 8.89, + "learning_rate": 1.68730612244898e-05, + "loss": 1.8938, + "step": 417500 + }, + { + "FLOPS loss": 0.0684627816081047, + "L0_d": 920.95, + "MLM loss": 1.9407403469085693, + "epoch": 8.9, + "step": 417999 + }, + { + "epoch": 8.9, + "learning_rate": 1.6771020408163267e-05, + "loss": 1.8937, + "step": 418000 + }, + { + "FLOPS loss": 0.06968861073255539, + "L0_d": 698.0, + "MLM loss": 1.9223194122314453, + "epoch": 8.91, + "step": 418499 + }, + { + "epoch": 8.91, + "learning_rate": 1.6668979591836735e-05, + "loss": 1.8959, + "step": 418500 + }, + { + "FLOPS loss": 0.07910454273223877, + "L0_d": 1309.33, + "MLM loss": 1.7841010093688965, + "epoch": 8.92, + "step": 418999 + }, + { + "epoch": 8.92, + "learning_rate": 1.656714285714286e-05, + "loss": 1.8974, + "step": 419000 + }, + { + "FLOPS loss": 0.06197473779320717, + "L0_d": 706.7, + "MLM loss": 1.8492615222930908, + "epoch": 8.93, + "step": 419499 + }, + { + "epoch": 8.93, + "learning_rate": 1.6465102040816327e-05, + "loss": 1.8932, + "step": 419500 + }, + { + "FLOPS loss": 0.058207958936691284, + "L0_d": 604.22, + "MLM loss": 1.8547852039337158, + "epoch": 8.94, + "step": 419999 + }, + { + "epoch": 8.94, + "learning_rate": 1.63630612244898e-05, + "loss": 1.8907, + "step": 420000 + }, + { + "FLOPS loss": 0.09094519168138504, + "L0_d": 902.23, + "MLM loss": 1.9144949913024902, + "epoch": 8.95, + "step": 420499 + }, + { + "epoch": 8.95, + "learning_rate": 1.6261020408163267e-05, + "loss": 1.8927, + "step": 420500 + }, + { + "FLOPS loss": 0.06928250938653946, + "L0_d": 714.59, + "MLM loss": 1.7490980625152588, + "epoch": 8.97, + "step": 420999 + }, + { + "epoch": 8.97, + "learning_rate": 1.6159183673469387e-05, + "loss": 1.8895, + "step": 421000 + }, + { + "FLOPS loss": 0.0713324099779129, + "L0_d": 694.98, + "MLM loss": 1.7904343605041504, + "epoch": 8.98, + "step": 421499 + }, + { + "epoch": 8.98, + "learning_rate": 1.6057142857142855e-05, + "loss": 1.8895, + "step": 421500 + }, + { + "FLOPS loss": 0.07053466886281967, + "L0_d": 647.95, + "MLM loss": 1.8427083492279053, + "epoch": 8.99, + "step": 421999 + }, + { + "epoch": 8.99, + "learning_rate": 1.5955102040816327e-05, + "loss": 1.8941, + "step": 422000 + }, + { + "FLOPS loss": 0.08139703422784805, + "L0_d": 1328.97, + "MLM loss": 1.7719063758850098, + "epoch": 9.0, + "step": 422499 + }, + { + "epoch": 9.0, + "learning_rate": 1.5853061224489795e-05, + "loss": 1.8931, + "step": 422500 + }, + { + "FLOPS loss": 0.06293307989835739, + "L0_d": 656.23, + "MLM loss": 1.8263726234436035, + "epoch": 9.01, + "step": 422999 + }, + { + "epoch": 9.01, + "learning_rate": 1.575122448979592e-05, + "loss": 1.89, + "step": 423000 + }, + { + "FLOPS loss": 0.08060479164123535, + "L0_d": 735.62, + "MLM loss": 1.697467565536499, + "epoch": 9.02, + "step": 423499 + }, + { + "epoch": 9.02, + "learning_rate": 1.564918367346939e-05, + "loss": 1.89, + "step": 423500 + }, + { + "FLOPS loss": 0.08011506497859955, + "L0_d": 729.72, + "MLM loss": 1.7792555093765259, + "epoch": 9.03, + "step": 423999 + }, + { + "epoch": 9.03, + "learning_rate": 1.554714285714286e-05, + "loss": 1.8894, + "step": 424000 + }, + { + "FLOPS loss": 0.07208956032991409, + "L0_d": 629.22, + "MLM loss": 1.8163645267486572, + "epoch": 9.04, + "step": 424499 + }, + { + "epoch": 9.04, + "learning_rate": 1.5445102040816327e-05, + "loss": 1.8884, + "step": 424500 + }, + { + "FLOPS loss": 0.06710517406463623, + "L0_d": 770.2, + "MLM loss": 1.7197039127349854, + "epoch": 9.05, + "step": 424999 + }, + { + "epoch": 9.05, + "learning_rate": 1.534326530612245e-05, + "loss": 1.89, + "step": 425000 + }, + { + "FLOPS loss": 0.09404978156089783, + "L0_d": 791.61, + "MLM loss": 1.7534675598144531, + "epoch": 9.06, + "step": 425499 + }, + { + "epoch": 9.06, + "learning_rate": 1.5241224489795919e-05, + "loss": 1.8884, + "step": 425500 + }, + { + "FLOPS loss": 0.07811388373374939, + "L0_d": 679.25, + "MLM loss": 1.8744385242462158, + "epoch": 9.07, + "step": 425999 + }, + { + "epoch": 9.07, + "learning_rate": 1.513918367346939e-05, + "loss": 1.8896, + "step": 426000 + }, + { + "FLOPS loss": 0.08018653094768524, + "L0_d": 643.23, + "MLM loss": 1.8160462379455566, + "epoch": 9.08, + "step": 426499 + }, + { + "epoch": 9.08, + "learning_rate": 1.5037142857142858e-05, + "loss": 1.8896, + "step": 426500 + }, + { + "FLOPS loss": 0.07460828870534897, + "L0_d": 683.14, + "MLM loss": 1.7526308298110962, + "epoch": 9.09, + "step": 426999 + }, + { + "epoch": 9.09, + "learning_rate": 1.4935102040816326e-05, + "loss": 1.8882, + "step": 427000 + }, + { + "FLOPS loss": 0.06290356814861298, + "L0_d": 624.78, + "MLM loss": 1.7872731685638428, + "epoch": 9.1, + "step": 427499 + }, + { + "epoch": 9.1, + "learning_rate": 1.4833061224489796e-05, + "loss": 1.8877, + "step": 427500 + }, + { + "FLOPS loss": 0.0771726593375206, + "L0_d": 798.14, + "MLM loss": 1.810828447341919, + "epoch": 9.11, + "step": 427999 + }, + { + "epoch": 9.11, + "learning_rate": 1.473122448979592e-05, + "loss": 1.883, + "step": 428000 + }, + { + "FLOPS loss": 0.07212334126234055, + "L0_d": 630.3, + "MLM loss": 1.8126250505447388, + "epoch": 9.13, + "step": 428499 + }, + { + "epoch": 9.13, + "learning_rate": 1.4629183673469388e-05, + "loss": 1.8931, + "step": 428500 + }, + { + "FLOPS loss": 0.06718722730875015, + "L0_d": 693.72, + "MLM loss": 1.8408608436584473, + "epoch": 9.14, + "step": 428999 + }, + { + "epoch": 9.14, + "learning_rate": 1.4527142857142858e-05, + "loss": 1.8907, + "step": 429000 + }, + { + "FLOPS loss": 0.07557131350040436, + "L0_d": 788.81, + "MLM loss": 1.761578917503357, + "epoch": 9.15, + "step": 429499 + }, + { + "epoch": 9.15, + "learning_rate": 1.4425102040816326e-05, + "loss": 1.8882, + "step": 429500 + }, + { + "FLOPS loss": 0.07352057844400406, + "L0_d": 762.31, + "MLM loss": 1.7296146154403687, + "epoch": 9.16, + "step": 429999 + }, + { + "epoch": 9.16, + "learning_rate": 1.4323265306122449e-05, + "loss": 1.8895, + "step": 430000 + }, + { + "FLOPS loss": 0.07733792811632156, + "L0_d": 806.64, + "MLM loss": 1.7991564273834229, + "epoch": 9.17, + "step": 430499 + }, + { + "epoch": 9.17, + "learning_rate": 1.422122448979592e-05, + "loss": 1.8896, + "step": 430500 + }, + { + "FLOPS loss": 0.08784882724285126, + "L0_d": 798.2, + "MLM loss": 1.869628667831421, + "epoch": 9.18, + "step": 430999 + }, + { + "epoch": 9.18, + "learning_rate": 1.4119183673469388e-05, + "loss": 1.8862, + "step": 431000 + }, + { + "FLOPS loss": 0.06718917936086655, + "L0_d": 759.0, + "MLM loss": 1.8948330879211426, + "epoch": 9.19, + "step": 431499 + }, + { + "epoch": 9.19, + "learning_rate": 1.4017142857142856e-05, + "loss": 1.8914, + "step": 431500 + }, + { + "FLOPS loss": 0.07390324771404266, + "L0_d": 652.94, + "MLM loss": 1.825411081314087, + "epoch": 9.2, + "step": 431999 + }, + { + "epoch": 9.2, + "learning_rate": 1.3915102040816328e-05, + "loss": 1.8882, + "step": 432000 + }, + { + "FLOPS loss": 0.0791919007897377, + "L0_d": 709.88, + "MLM loss": 1.9184443950653076, + "epoch": 9.21, + "step": 432499 + }, + { + "epoch": 9.21, + "learning_rate": 1.381326530612245e-05, + "loss": 1.8842, + "step": 432500 + }, + { + "FLOPS loss": 0.061726462095975876, + "L0_d": 551.17, + "MLM loss": 1.8523519039154053, + "epoch": 9.22, + "step": 432999 + }, + { + "epoch": 9.22, + "learning_rate": 1.3711224489795918e-05, + "loss": 1.889, + "step": 433000 + }, + { + "FLOPS loss": 0.07693947851657867, + "L0_d": 629.14, + "MLM loss": 1.7981572151184082, + "epoch": 9.23, + "step": 433499 + }, + { + "epoch": 9.23, + "learning_rate": 1.360918367346939e-05, + "loss": 1.886, + "step": 433500 + }, + { + "FLOPS loss": 0.07043065875768661, + "L0_d": 669.44, + "MLM loss": 1.8404135704040527, + "epoch": 9.24, + "step": 433999 + }, + { + "epoch": 9.24, + "learning_rate": 1.3507142857142858e-05, + "loss": 1.8868, + "step": 434000 + }, + { + "FLOPS loss": 0.06310301274061203, + "L0_d": 744.97, + "MLM loss": 1.7336649894714355, + "epoch": 9.25, + "step": 434499 + }, + { + "epoch": 9.25, + "learning_rate": 1.3405102040816328e-05, + "loss": 1.8908, + "step": 434500 + }, + { + "FLOPS loss": 0.06630953401327133, + "L0_d": 758.22, + "MLM loss": 1.7070391178131104, + "epoch": 9.26, + "step": 434999 + }, + { + "epoch": 9.26, + "learning_rate": 1.330326530612245e-05, + "loss": 1.8862, + "step": 435000 + }, + { + "FLOPS loss": 0.07912331819534302, + "L0_d": 704.33, + "MLM loss": 1.72645103931427, + "epoch": 9.27, + "step": 435499 + }, + { + "epoch": 9.27, + "learning_rate": 1.3201224489795918e-05, + "loss": 1.8879, + "step": 435500 + }, + { + "FLOPS loss": 0.061746422201395035, + "L0_d": 685.48, + "MLM loss": 1.7991679906845093, + "epoch": 9.28, + "step": 435999 + }, + { + "epoch": 9.28, + "learning_rate": 1.309918367346939e-05, + "loss": 1.8863, + "step": 436000 + }, + { + "FLOPS loss": 0.06622132658958435, + "L0_d": 738.52, + "MLM loss": 1.7106609344482422, + "epoch": 9.3, + "step": 436499 + }, + { + "epoch": 9.3, + "learning_rate": 1.2997142857142858e-05, + "loss": 1.8833, + "step": 436500 + }, + { + "FLOPS loss": 0.08270273357629776, + "L0_d": 1133.8, + "MLM loss": 1.7119030952453613, + "epoch": 9.31, + "step": 436999 + }, + { + "epoch": 9.31, + "learning_rate": 1.289530612244898e-05, + "loss": 1.8889, + "step": 437000 + }, + { + "FLOPS loss": 0.08029799908399582, + "L0_d": 887.58, + "MLM loss": 1.7202215194702148, + "epoch": 9.32, + "step": 437499 + }, + { + "epoch": 9.32, + "learning_rate": 1.2793265306122448e-05, + "loss": 1.8836, + "step": 437500 + }, + { + "FLOPS loss": 0.0647151842713356, + "L0_d": 644.31, + "MLM loss": 1.7943322658538818, + "epoch": 9.33, + "step": 437999 + }, + { + "epoch": 9.33, + "learning_rate": 1.269122448979592e-05, + "loss": 1.8865, + "step": 438000 + }, + { + "FLOPS loss": 0.09509658813476562, + "L0_d": 1019.94, + "MLM loss": 1.8161685466766357, + "epoch": 9.34, + "step": 438499 + }, + { + "epoch": 9.34, + "learning_rate": 1.2589183673469388e-05, + "loss": 1.8856, + "step": 438500 + }, + { + "FLOPS loss": 0.08688385784626007, + "L0_d": 775.3, + "MLM loss": 1.843897819519043, + "epoch": 9.35, + "step": 438999 + }, + { + "epoch": 9.35, + "learning_rate": 1.2487346938775512e-05, + "loss": 1.887, + "step": 439000 + }, + { + "FLOPS loss": 0.07664480060338974, + "L0_d": 716.61, + "MLM loss": 1.8365232944488525, + "epoch": 9.36, + "step": 439499 + }, + { + "epoch": 9.36, + "learning_rate": 1.2385306122448981e-05, + "loss": 1.8852, + "step": 439500 + }, + { + "FLOPS loss": 0.07584109157323837, + "L0_d": 1017.5, + "MLM loss": 1.8281240463256836, + "epoch": 9.37, + "step": 439999 + }, + { + "epoch": 9.37, + "learning_rate": 1.228326530612245e-05, + "loss": 1.8839, + "step": 440000 + }, + { + "FLOPS loss": 0.09187859296798706, + "L0_d": 768.38, + "MLM loss": 1.9023255109786987, + "epoch": 9.38, + "step": 440499 + }, + { + "epoch": 9.38, + "learning_rate": 1.218122448979592e-05, + "loss": 1.8842, + "step": 440500 + }, + { + "FLOPS loss": 0.07121729850769043, + "L0_d": 589.25, + "MLM loss": 1.7611678838729858, + "epoch": 9.39, + "step": 440999 + }, + { + "epoch": 9.39, + "learning_rate": 1.2079387755102042e-05, + "loss": 1.8823, + "step": 441000 + }, + { + "FLOPS loss": 0.07112989574670792, + "L0_d": 655.28, + "MLM loss": 1.781064748764038, + "epoch": 9.4, + "step": 441499 + }, + { + "epoch": 9.4, + "learning_rate": 1.197734693877551e-05, + "loss": 1.8905, + "step": 441500 + }, + { + "FLOPS loss": 0.06727603822946548, + "L0_d": 627.89, + "MLM loss": 1.7931089401245117, + "epoch": 9.41, + "step": 441999 + }, + { + "epoch": 9.41, + "learning_rate": 1.187530612244898e-05, + "loss": 1.8821, + "step": 442000 + }, + { + "FLOPS loss": 0.08041385561227798, + "L0_d": 826.0, + "MLM loss": 1.8660180568695068, + "epoch": 9.42, + "step": 442499 + }, + { + "epoch": 9.42, + "learning_rate": 1.177326530612245e-05, + "loss": 1.8863, + "step": 442500 + }, + { + "FLOPS loss": 0.06488444656133652, + "L0_d": 650.3, + "MLM loss": 1.5891389846801758, + "epoch": 9.43, + "step": 442999 + }, + { + "epoch": 9.43, + "learning_rate": 1.1671224489795919e-05, + "loss": 1.8857, + "step": 443000 + }, + { + "FLOPS loss": 0.07908573746681213, + "L0_d": 814.28, + "MLM loss": 1.8341909646987915, + "epoch": 9.44, + "step": 443499 + }, + { + "epoch": 9.44, + "learning_rate": 1.1569387755102042e-05, + "loss": 1.887, + "step": 443500 + }, + { + "FLOPS loss": 0.0627230852842331, + "L0_d": 725.02, + "MLM loss": 1.9674441814422607, + "epoch": 9.46, + "step": 443999 + }, + { + "epoch": 9.46, + "learning_rate": 1.1467346938775511e-05, + "loss": 1.8837, + "step": 444000 + }, + { + "FLOPS loss": 0.07779854536056519, + "L0_d": 792.72, + "MLM loss": 1.77005934715271, + "epoch": 9.47, + "step": 444499 + }, + { + "epoch": 9.47, + "learning_rate": 1.1365306122448981e-05, + "loss": 1.8831, + "step": 444500 + }, + { + "FLOPS loss": 0.06868050247430801, + "L0_d": 651.2, + "MLM loss": 1.7083113193511963, + "epoch": 9.48, + "step": 444999 + }, + { + "epoch": 9.48, + "learning_rate": 1.1263265306122449e-05, + "loss": 1.884, + "step": 445000 + }, + { + "FLOPS loss": 0.07770167291164398, + "L0_d": 732.28, + "MLM loss": 1.8704335689544678, + "epoch": 9.49, + "step": 445499 + }, + { + "epoch": 9.49, + "learning_rate": 1.1161224489795919e-05, + "loss": 1.8882, + "step": 445500 + }, + { + "FLOPS loss": 0.07393424212932587, + "L0_d": 602.7, + "MLM loss": 1.8852603435516357, + "epoch": 9.5, + "step": 445999 + }, + { + "epoch": 9.5, + "learning_rate": 1.1059387755102041e-05, + "loss": 1.8845, + "step": 446000 + }, + { + "FLOPS loss": 0.061653368175029755, + "L0_d": 612.22, + "MLM loss": 1.9074640274047852, + "epoch": 9.51, + "step": 446499 + }, + { + "epoch": 9.51, + "learning_rate": 1.0957346938775511e-05, + "loss": 1.8839, + "step": 446500 + }, + { + "FLOPS loss": 0.08355122804641724, + "L0_d": 753.42, + "MLM loss": 1.7888277769088745, + "epoch": 9.52, + "step": 446999 + }, + { + "epoch": 9.52, + "learning_rate": 1.085530612244898e-05, + "loss": 1.8827, + "step": 447000 + }, + { + "FLOPS loss": 0.07092072814702988, + "L0_d": 697.31, + "MLM loss": 1.7585101127624512, + "epoch": 9.53, + "step": 447499 + }, + { + "epoch": 9.53, + "learning_rate": 1.0753265306122449e-05, + "loss": 1.8809, + "step": 447500 + }, + { + "FLOPS loss": 0.07540085166692734, + "L0_d": 741.02, + "MLM loss": 1.7078624963760376, + "epoch": 9.54, + "step": 447999 + }, + { + "epoch": 9.54, + "learning_rate": 1.0651428571428571e-05, + "loss": 1.8872, + "step": 448000 + }, + { + "FLOPS loss": 0.0650225356221199, + "L0_d": 752.22, + "MLM loss": 1.9339125156402588, + "epoch": 9.55, + "step": 448499 + }, + { + "epoch": 9.55, + "learning_rate": 1.0549387755102041e-05, + "loss": 1.8863, + "step": 448500 + }, + { + "FLOPS loss": 0.06345243006944656, + "L0_d": 790.52, + "MLM loss": 1.9979722499847412, + "epoch": 9.56, + "step": 448999 + }, + { + "epoch": 9.56, + "learning_rate": 1.0447346938775511e-05, + "loss": 1.8843, + "step": 449000 + }, + { + "FLOPS loss": 0.06612099707126617, + "L0_d": 640.97, + "MLM loss": 1.8012644052505493, + "epoch": 9.57, + "step": 449499 + }, + { + "epoch": 9.57, + "learning_rate": 1.034530612244898e-05, + "loss": 1.8834, + "step": 449500 + }, + { + "FLOPS loss": 0.06571657210588455, + "L0_d": 709.7, + "MLM loss": 1.732346773147583, + "epoch": 9.58, + "step": 449999 + }, + { + "epoch": 9.58, + "learning_rate": 1.024326530612245e-05, + "loss": 1.8825, + "step": 450000 + }, + { + "FLOPS loss": 0.09524022042751312, + "L0_d": 713.88, + "MLM loss": 1.8579025268554688, + "epoch": 9.59, + "step": 450499 + }, + { + "epoch": 9.59, + "learning_rate": 1.0141428571428573e-05, + "loss": 1.8793, + "step": 450500 + }, + { + "FLOPS loss": 0.08708564192056656, + "L0_d": 764.3, + "MLM loss": 1.7488106489181519, + "epoch": 9.6, + "step": 450999 + }, + { + "epoch": 9.6, + "learning_rate": 1.0039387755102041e-05, + "loss": 1.8842, + "step": 451000 + }, + { + "FLOPS loss": 0.06419456005096436, + "L0_d": 609.41, + "MLM loss": 1.728226661682129, + "epoch": 9.61, + "step": 451499 + }, + { + "epoch": 9.61, + "learning_rate": 9.93734693877551e-06, + "loss": 1.8823, + "step": 451500 + }, + { + "FLOPS loss": 0.07620800286531448, + "L0_d": 819.53, + "MLM loss": 1.7481298446655273, + "epoch": 9.63, + "step": 451999 + }, + { + "epoch": 9.63, + "learning_rate": 9.83530612244898e-06, + "loss": 1.8809, + "step": 452000 + }, + { + "FLOPS loss": 0.07101784646511078, + "L0_d": 665.31, + "MLM loss": 1.7855209112167358, + "epoch": 9.64, + "step": 452499 + }, + { + "epoch": 9.64, + "learning_rate": 9.733469387755103e-06, + "loss": 1.8854, + "step": 452500 + }, + { + "FLOPS loss": 0.07623369246721268, + "L0_d": 771.8, + "MLM loss": 1.7835557460784912, + "epoch": 9.65, + "step": 452999 + }, + { + "epoch": 9.65, + "learning_rate": 9.631428571428573e-06, + "loss": 1.8816, + "step": 453000 + }, + { + "FLOPS loss": 0.08137229084968567, + "L0_d": 841.5, + "MLM loss": 1.8653662204742432, + "epoch": 9.66, + "step": 453499 + }, + { + "epoch": 9.66, + "learning_rate": 9.52938775510204e-06, + "loss": 1.8856, + "step": 453500 + }, + { + "FLOPS loss": 0.0767243355512619, + "L0_d": 1113.84, + "MLM loss": 1.8295722007751465, + "epoch": 9.67, + "step": 453999 + }, + { + "epoch": 9.67, + "learning_rate": 9.42734693877551e-06, + "loss": 1.8776, + "step": 454000 + }, + { + "FLOPS loss": 0.078528992831707, + "L0_d": 882.05, + "MLM loss": 1.8933274745941162, + "epoch": 9.68, + "step": 454499 + }, + { + "epoch": 9.68, + "learning_rate": 9.325510204081633e-06, + "loss": 1.8817, + "step": 454500 + }, + { + "FLOPS loss": 0.07128248363733292, + "L0_d": 683.69, + "MLM loss": 1.6863057613372803, + "epoch": 9.69, + "step": 454999 + }, + { + "epoch": 9.69, + "learning_rate": 9.223469387755103e-06, + "loss": 1.8791, + "step": 455000 + }, + { + "FLOPS loss": 0.08068500459194183, + "L0_d": 753.53, + "MLM loss": 1.7771949768066406, + "epoch": 9.7, + "step": 455499 + }, + { + "epoch": 9.7, + "learning_rate": 9.121428571428572e-06, + "loss": 1.8827, + "step": 455500 + }, + { + "FLOPS loss": 0.06115632876753807, + "L0_d": 648.83, + "MLM loss": 1.7620165348052979, + "epoch": 9.71, + "step": 455999 + }, + { + "epoch": 9.71, + "learning_rate": 9.01938775510204e-06, + "loss": 1.8839, + "step": 456000 + }, + { + "FLOPS loss": 0.08328636735677719, + "L0_d": 747.77, + "MLM loss": 1.7784515619277954, + "epoch": 9.72, + "step": 456499 + }, + { + "epoch": 9.72, + "learning_rate": 8.91734693877551e-06, + "loss": 1.8827, + "step": 456500 + }, + { + "FLOPS loss": 0.07326602935791016, + "L0_d": 676.11, + "MLM loss": 1.8248202800750732, + "epoch": 9.73, + "step": 456999 + }, + { + "epoch": 9.73, + "learning_rate": 8.815510204081633e-06, + "loss": 1.8811, + "step": 457000 + }, + { + "FLOPS loss": 0.06962653249502182, + "L0_d": 670.75, + "MLM loss": 1.9677387475967407, + "epoch": 9.74, + "step": 457499 + }, + { + "epoch": 9.74, + "learning_rate": 8.713469387755102e-06, + "loss": 1.8818, + "step": 457500 + }, + { + "FLOPS loss": 0.0688956156373024, + "L0_d": 924.66, + "MLM loss": 1.8603427410125732, + "epoch": 9.75, + "step": 457999 + }, + { + "epoch": 9.75, + "learning_rate": 8.611428571428572e-06, + "loss": 1.8859, + "step": 458000 + }, + { + "FLOPS loss": 0.07736175507307053, + "L0_d": 1061.88, + "MLM loss": 1.8138039112091064, + "epoch": 9.76, + "step": 458499 + }, + { + "epoch": 9.76, + "learning_rate": 8.509387755102042e-06, + "loss": 1.8817, + "step": 458500 + }, + { + "FLOPS loss": 0.0671883374452591, + "L0_d": 722.67, + "MLM loss": 1.7381740808486938, + "epoch": 9.77, + "step": 458999 + }, + { + "epoch": 9.77, + "learning_rate": 8.407551020408164e-06, + "loss": 1.8798, + "step": 459000 + }, + { + "FLOPS loss": 0.07179486751556396, + "L0_d": 825.41, + "MLM loss": 1.8373209238052368, + "epoch": 9.79, + "step": 459499 + }, + { + "epoch": 9.79, + "learning_rate": 8.305510204081632e-06, + "loss": 1.8839, + "step": 459500 + }, + { + "FLOPS loss": 0.059714049100875854, + "L0_d": 530.77, + "MLM loss": 1.7862880229949951, + "epoch": 9.8, + "step": 459999 + }, + { + "epoch": 9.8, + "learning_rate": 8.203469387755102e-06, + "loss": 1.8817, + "step": 460000 + }, + { + "FLOPS loss": 0.0783819779753685, + "L0_d": 960.45, + "MLM loss": 1.8646221160888672, + "epoch": 9.81, + "step": 460499 + }, + { + "epoch": 9.81, + "learning_rate": 8.101428571428572e-06, + "loss": 1.879, + "step": 460500 + }, + { + "FLOPS loss": 0.06765973567962646, + "L0_d": 635.81, + "MLM loss": 1.9023945331573486, + "epoch": 9.82, + "step": 460999 + }, + { + "epoch": 9.82, + "learning_rate": 7.999591836734694e-06, + "loss": 1.8803, + "step": 461000 + }, + { + "FLOPS loss": 0.06713356822729111, + "L0_d": 643.22, + "MLM loss": 1.6709318161010742, + "epoch": 9.83, + "step": 461499 + }, + { + "epoch": 9.83, + "learning_rate": 7.897551020408164e-06, + "loss": 1.8788, + "step": 461500 + }, + { + "FLOPS loss": 0.06832224130630493, + "L0_d": 696.81, + "MLM loss": 1.7683454751968384, + "epoch": 9.84, + "step": 461999 + }, + { + "epoch": 9.84, + "learning_rate": 7.795510204081632e-06, + "loss": 1.8791, + "step": 462000 + }, + { + "FLOPS loss": 0.08137806504964828, + "L0_d": 629.3, + "MLM loss": 1.7359639406204224, + "epoch": 9.85, + "step": 462499 + }, + { + "epoch": 9.85, + "learning_rate": 7.693469387755102e-06, + "loss": 1.8806, + "step": 462500 + }, + { + "FLOPS loss": 0.07944727689027786, + "L0_d": 1020.34, + "MLM loss": 1.6010856628417969, + "epoch": 9.86, + "step": 462999 + }, + { + "epoch": 9.86, + "learning_rate": 7.591428571428572e-06, + "loss": 1.8793, + "step": 463000 + }, + { + "FLOPS loss": 0.06799176335334778, + "L0_d": 662.75, + "MLM loss": 1.7740750312805176, + "epoch": 9.87, + "step": 463499 + }, + { + "epoch": 9.87, + "learning_rate": 7.489387755102041e-06, + "loss": 1.8779, + "step": 463500 + }, + { + "FLOPS loss": 0.08271846920251846, + "L0_d": 764.22, + "MLM loss": 1.8342242240905762, + "epoch": 9.88, + "step": 463999 + }, + { + "epoch": 9.88, + "learning_rate": 7.387551020408163e-06, + "loss": 1.876, + "step": 464000 + }, + { + "FLOPS loss": 0.0854814201593399, + "L0_d": 922.66, + "MLM loss": 1.876578450202942, + "epoch": 9.89, + "step": 464499 + }, + { + "epoch": 9.89, + "learning_rate": 7.285510204081633e-06, + "loss": 1.8791, + "step": 464500 + }, + { + "FLOPS loss": 0.10036221146583557, + "L0_d": 907.89, + "MLM loss": 1.777255654335022, + "epoch": 9.9, + "step": 464999 + }, + { + "epoch": 9.9, + "learning_rate": 7.183469387755103e-06, + "loss": 1.8797, + "step": 465000 + }, + { + "FLOPS loss": 0.06955206394195557, + "L0_d": 749.52, + "MLM loss": 1.621423363685608, + "epoch": 9.91, + "step": 465499 + }, + { + "epoch": 9.91, + "learning_rate": 7.0814285714285725e-06, + "loss": 1.876, + "step": 465500 + }, + { + "FLOPS loss": 0.06892334669828415, + "L0_d": 697.23, + "MLM loss": 1.743448257446289, + "epoch": 9.92, + "step": 465999 + }, + { + "epoch": 9.92, + "learning_rate": 6.979591836734695e-06, + "loss": 1.8819, + "step": 466000 + }, + { + "FLOPS loss": 0.0751122310757637, + "L0_d": 797.48, + "MLM loss": 1.9870507717132568, + "epoch": 9.93, + "step": 466499 + }, + { + "epoch": 9.93, + "learning_rate": 6.877551020408164e-06, + "loss": 1.8828, + "step": 466500 + }, + { + "FLOPS loss": 0.08745686709880829, + "L0_d": 953.25, + "MLM loss": 1.8826556205749512, + "epoch": 9.95, + "step": 466999 + }, + { + "epoch": 9.95, + "learning_rate": 6.775510204081633e-06, + "loss": 1.878, + "step": 467000 + }, + { + "FLOPS loss": 0.07517757266759872, + "L0_d": 826.22, + "MLM loss": 1.7692055702209473, + "epoch": 9.96, + "step": 467499 + }, + { + "epoch": 9.96, + "learning_rate": 6.673469387755102e-06, + "loss": 1.8798, + "step": 467500 + }, + { + "FLOPS loss": 0.08363378793001175, + "L0_d": 745.77, + "MLM loss": 1.752866506576538, + "epoch": 9.97, + "step": 467999 + }, + { + "epoch": 9.97, + "learning_rate": 6.571632653061224e-06, + "loss": 1.8782, + "step": 468000 + }, + { + "FLOPS loss": 0.05404847860336304, + "L0_d": 604.16, + "MLM loss": 1.909287691116333, + "epoch": 9.98, + "step": 468499 + }, + { + "epoch": 9.98, + "learning_rate": 6.469591836734694e-06, + "loss": 1.8779, + "step": 468500 + }, + { + "FLOPS loss": 0.09000225365161896, + "L0_d": 819.59, + "MLM loss": 1.7899792194366455, + "epoch": 9.99, + "step": 468999 + }, + { + "epoch": 9.99, + "learning_rate": 6.367551020408164e-06, + "loss": 1.8752, + "step": 469000 + }, + { + "FLOPS loss": 0.07309328764677048, + "L0_d": 703.8, + "MLM loss": 1.8317029476165771, + "epoch": 10.0, + "step": 469499 + }, + { + "epoch": 10.0, + "learning_rate": 6.265510204081633e-06, + "loss": 1.8769, + "step": 469500 + }, + { + "FLOPS loss": 0.06383686512708664, + "L0_d": 687.62, + "MLM loss": 1.7126272916793823, + "epoch": 10.01, + "step": 469999 + }, + { + "epoch": 10.01, + "learning_rate": 6.163469387755102e-06, + "loss": 1.8771, + "step": 470000 + }, + { + "FLOPS loss": 0.07466043531894684, + "L0_d": 799.62, + "MLM loss": 1.8500142097473145, + "epoch": 10.02, + "step": 470499 + }, + { + "epoch": 10.02, + "learning_rate": 6.061428571428571e-06, + "loss": 1.8747, + "step": 470500 + }, + { + "FLOPS loss": 0.07076152414083481, + "L0_d": 837.88, + "MLM loss": 1.8527822494506836, + "epoch": 10.03, + "step": 470999 + }, + { + "epoch": 10.03, + "learning_rate": 5.959591836734694e-06, + "loss": 1.8736, + "step": 471000 + }, + { + "FLOPS loss": 0.07877156138420105, + "L0_d": 717.0, + "MLM loss": 1.727238416671753, + "epoch": 10.04, + "step": 471499 + }, + { + "epoch": 10.04, + "learning_rate": 5.857551020408163e-06, + "loss": 1.8719, + "step": 471500 + }, + { + "FLOPS loss": 0.07525794953107834, + "L0_d": 743.8, + "MLM loss": 1.7471095323562622, + "epoch": 10.05, + "step": 471999 + }, + { + "epoch": 10.05, + "learning_rate": 5.755510204081633e-06, + "loss": 1.8739, + "step": 472000 + }, + { + "FLOPS loss": 0.06217565760016441, + "L0_d": 702.58, + "MLM loss": 1.8815577030181885, + "epoch": 10.06, + "step": 472499 + }, + { + "epoch": 10.06, + "learning_rate": 5.653469387755102e-06, + "loss": 1.8783, + "step": 472500 + }, + { + "FLOPS loss": 0.08705900609493256, + "L0_d": 895.94, + "MLM loss": 1.7565627098083496, + "epoch": 10.07, + "step": 472999 + }, + { + "epoch": 10.07, + "learning_rate": 5.5516326530612245e-06, + "loss": 1.8748, + "step": 473000 + }, + { + "FLOPS loss": 0.056544363498687744, + "L0_d": 588.73, + "MLM loss": 1.7726197242736816, + "epoch": 10.08, + "step": 473499 + }, + { + "epoch": 10.08, + "learning_rate": 5.449591836734694e-06, + "loss": 1.8741, + "step": 473500 + }, + { + "FLOPS loss": 0.06269951909780502, + "L0_d": 648.5, + "MLM loss": 1.9060299396514893, + "epoch": 10.09, + "step": 473999 + }, + { + "epoch": 10.09, + "learning_rate": 5.347551020408163e-06, + "loss": 1.8779, + "step": 474000 + }, + { + "FLOPS loss": 0.08251946419477463, + "L0_d": 952.05, + "MLM loss": 1.7872275114059448, + "epoch": 10.1, + "step": 474499 + }, + { + "epoch": 10.1, + "learning_rate": 5.245510204081633e-06, + "loss": 1.8746, + "step": 474500 + }, + { + "FLOPS loss": 0.07168962806463242, + "L0_d": 652.42, + "MLM loss": 1.715721845626831, + "epoch": 10.12, + "step": 474999 + }, + { + "epoch": 10.12, + "learning_rate": 5.143469387755103e-06, + "loss": 1.8689, + "step": 475000 + }, + { + "FLOPS loss": 0.06996005773544312, + "L0_d": 860.59, + "MLM loss": 1.854485034942627, + "epoch": 10.13, + "step": 475499 + }, + { + "epoch": 10.13, + "learning_rate": 5.041632653061225e-06, + "loss": 1.8754, + "step": 475500 + }, + { + "FLOPS loss": 0.0643070861697197, + "L0_d": 657.98, + "MLM loss": 1.689789056777954, + "epoch": 10.14, + "step": 475999 + }, + { + "epoch": 10.14, + "learning_rate": 4.939591836734694e-06, + "loss": 1.8756, + "step": 476000 + }, + { + "FLOPS loss": 0.08355584740638733, + "L0_d": 897.23, + "MLM loss": 1.758540153503418, + "epoch": 10.15, + "step": 476499 + }, + { + "epoch": 10.15, + "learning_rate": 4.837551020408163e-06, + "loss": 1.8762, + "step": 476500 + }, + { + "FLOPS loss": 0.08008190989494324, + "L0_d": 704.61, + "MLM loss": 1.6079193353652954, + "epoch": 10.16, + "step": 476999 + }, + { + "epoch": 10.16, + "learning_rate": 4.735510204081633e-06, + "loss": 1.8762, + "step": 477000 + }, + { + "FLOPS loss": 0.06911767274141312, + "L0_d": 871.81, + "MLM loss": 1.8158210515975952, + "epoch": 10.17, + "step": 477499 + }, + { + "epoch": 10.17, + "learning_rate": 4.633673469387755e-06, + "loss": 1.8744, + "step": 477500 + }, + { + "FLOPS loss": 0.08581630885601044, + "L0_d": 720.11, + "MLM loss": 1.918521523475647, + "epoch": 10.18, + "step": 477999 + }, + { + "epoch": 10.18, + "learning_rate": 4.531632653061225e-06, + "loss": 1.8723, + "step": 478000 + }, + { + "FLOPS loss": 0.09056857973337173, + "L0_d": 878.39, + "MLM loss": 1.8207356929779053, + "epoch": 10.19, + "step": 478499 + }, + { + "epoch": 10.19, + "learning_rate": 4.429591836734695e-06, + "loss": 1.8751, + "step": 478500 + }, + { + "FLOPS loss": 0.08066496253013611, + "L0_d": 708.09, + "MLM loss": 1.8024837970733643, + "epoch": 10.2, + "step": 478999 + }, + { + "epoch": 10.2, + "learning_rate": 4.327551020408163e-06, + "loss": 1.873, + "step": 479000 + }, + { + "FLOPS loss": 0.08622293174266815, + "L0_d": 819.22, + "MLM loss": 1.7358485460281372, + "epoch": 10.21, + "step": 479499 + }, + { + "epoch": 10.21, + "learning_rate": 4.225714285714286e-06, + "loss": 1.878, + "step": 479500 + }, + { + "FLOPS loss": 0.08763483911752701, + "L0_d": 772.67, + "MLM loss": 1.7484935522079468, + "epoch": 10.22, + "step": 479999 + }, + { + "epoch": 10.22, + "learning_rate": 4.123673469387755e-06, + "loss": 1.8734, + "step": 480000 + }, + { + "FLOPS loss": 0.06542215496301651, + "L0_d": 816.7, + "MLM loss": 1.807370662689209, + "epoch": 10.23, + "step": 480499 + }, + { + "epoch": 10.23, + "learning_rate": 4.021632653061225e-06, + "loss": 1.874, + "step": 480500 + }, + { + "FLOPS loss": 0.06360576301813126, + "L0_d": 633.91, + "MLM loss": 1.731799840927124, + "epoch": 10.24, + "step": 480999 + }, + { + "epoch": 10.24, + "learning_rate": 3.9195918367346945e-06, + "loss": 1.8732, + "step": 481000 + }, + { + "FLOPS loss": 0.06134741008281708, + "L0_d": 653.84, + "MLM loss": 1.7690579891204834, + "epoch": 10.25, + "step": 481499 + }, + { + "epoch": 10.25, + "learning_rate": 3.817755102040817e-06, + "loss": 1.8712, + "step": 481500 + }, + { + "FLOPS loss": 0.06777947396039963, + "L0_d": 1139.48, + "MLM loss": 1.770931601524353, + "epoch": 10.26, + "step": 481999 + }, + { + "epoch": 10.26, + "learning_rate": 3.7157142857142854e-06, + "loss": 1.8771, + "step": 482000 + }, + { + "FLOPS loss": 0.05449703335762024, + "L0_d": 670.72, + "MLM loss": 1.8757489919662476, + "epoch": 10.28, + "step": 482499 + }, + { + "epoch": 10.28, + "learning_rate": 3.613673469387755e-06, + "loss": 1.8739, + "step": 482500 + }, + { + "FLOPS loss": 0.06846970319747925, + "L0_d": 700.67, + "MLM loss": 1.851085901260376, + "epoch": 10.29, + "step": 482999 + }, + { + "epoch": 10.29, + "learning_rate": 3.5116326530612245e-06, + "loss": 1.871, + "step": 483000 + }, + { + "FLOPS loss": 0.0666121169924736, + "L0_d": 764.92, + "MLM loss": 1.8932390213012695, + "epoch": 10.3, + "step": 483499 + }, + { + "epoch": 10.3, + "learning_rate": 3.4095918367346943e-06, + "loss": 1.8723, + "step": 483500 + }, + { + "FLOPS loss": 0.06457451730966568, + "L0_d": 554.03, + "MLM loss": 1.8126370906829834, + "epoch": 10.31, + "step": 483999 + }, + { + "epoch": 10.31, + "learning_rate": 3.3077551020408167e-06, + "loss": 1.8745, + "step": 484000 + }, + { + "FLOPS loss": 0.09475447982549667, + "L0_d": 946.62, + "MLM loss": 1.9367070198059082, + "epoch": 10.32, + "step": 484499 + }, + { + "epoch": 10.32, + "learning_rate": 3.205714285714286e-06, + "loss": 1.8732, + "step": 484500 + }, + { + "FLOPS loss": 0.07273074239492416, + "L0_d": 770.86, + "MLM loss": 1.8612580299377441, + "epoch": 10.33, + "step": 484999 + }, + { + "epoch": 10.33, + "learning_rate": 3.1036734693877554e-06, + "loss": 1.8739, + "step": 485000 + }, + { + "FLOPS loss": 0.06570863723754883, + "L0_d": 606.78, + "MLM loss": 1.8435473442077637, + "epoch": 10.34, + "step": 485499 + }, + { + "epoch": 10.34, + "learning_rate": 3.0016326530612248e-06, + "loss": 1.8713, + "step": 485500 + }, + { + "FLOPS loss": 0.07231251895427704, + "L0_d": 614.42, + "MLM loss": 1.7718569040298462, + "epoch": 10.35, + "step": 485999 + }, + { + "epoch": 10.35, + "learning_rate": 2.899795918367347e-06, + "loss": 1.8732, + "step": 486000 + }, + { + "FLOPS loss": 0.06499864161014557, + "L0_d": 571.12, + "MLM loss": 1.8375295400619507, + "epoch": 10.36, + "step": 486499 + }, + { + "epoch": 10.36, + "learning_rate": 2.7977551020408165e-06, + "loss": 1.8758, + "step": 486500 + }, + { + "FLOPS loss": 0.06405822932720184, + "L0_d": 1054.19, + "MLM loss": 1.9204262495040894, + "epoch": 10.37, + "step": 486999 + }, + { + "epoch": 10.37, + "learning_rate": 2.695714285714286e-06, + "loss": 1.8758, + "step": 487000 + }, + { + "FLOPS loss": 0.07824577391147614, + "L0_d": 723.75, + "MLM loss": 1.8445980548858643, + "epoch": 10.38, + "step": 487499 + }, + { + "epoch": 10.38, + "learning_rate": 2.5936734693877552e-06, + "loss": 1.8742, + "step": 487500 + }, + { + "FLOPS loss": 0.07497908174991608, + "L0_d": 718.8, + "MLM loss": 1.6689207553863525, + "epoch": 10.39, + "step": 487999 + }, + { + "epoch": 10.39, + "learning_rate": 2.4916326530612246e-06, + "loss": 1.8707, + "step": 488000 + }, + { + "FLOPS loss": 0.07021187990903854, + "L0_d": 646.23, + "MLM loss": 1.7981057167053223, + "epoch": 10.4, + "step": 488499 + }, + { + "epoch": 10.4, + "learning_rate": 2.389795918367347e-06, + "loss": 1.8736, + "step": 488500 + }, + { + "FLOPS loss": 0.07197225093841553, + "L0_d": 839.08, + "MLM loss": 1.9683432579040527, + "epoch": 10.41, + "step": 488999 + }, + { + "epoch": 10.41, + "learning_rate": 2.2877551020408167e-06, + "loss": 1.8734, + "step": 489000 + }, + { + "FLOPS loss": 0.06169842928647995, + "L0_d": 680.11, + "MLM loss": 1.848118543624878, + "epoch": 10.42, + "step": 489499 + }, + { + "epoch": 10.42, + "learning_rate": 2.1857142857142857e-06, + "loss": 1.8705, + "step": 489500 + }, + { + "FLOPS loss": 0.06455899775028229, + "L0_d": 807.06, + "MLM loss": 1.9502975940704346, + "epoch": 10.43, + "step": 489999 + }, + { + "epoch": 10.43, + "learning_rate": 2.0836734693877554e-06, + "loss": 1.8699, + "step": 490000 + }, + { + "FLOPS loss": 0.08180254697799683, + "L0_d": 711.53, + "MLM loss": 1.8077306747436523, + "epoch": 10.45, + "step": 490499 + }, + { + "epoch": 10.45, + "learning_rate": 1.9816326530612244e-06, + "loss": 1.8721, + "step": 490500 + }, + { + "FLOPS loss": 0.06761618703603745, + "L0_d": 811.06, + "MLM loss": 1.7293885946273804, + "epoch": 10.46, + "step": 490999 + }, + { + "epoch": 10.46, + "learning_rate": 1.879795918367347e-06, + "loss": 1.8726, + "step": 491000 + }, + { + "FLOPS loss": 0.08343058079481125, + "L0_d": 1113.06, + "MLM loss": 1.7562888860702515, + "epoch": 10.47, + "step": 491499 + }, + { + "epoch": 10.47, + "learning_rate": 1.7777551020408165e-06, + "loss": 1.8713, + "step": 491500 + }, + { + "FLOPS loss": 0.07647604495286942, + "L0_d": 812.47, + "MLM loss": 1.814285159111023, + "epoch": 10.48, + "step": 491999 + }, + { + "epoch": 10.48, + "learning_rate": 1.6757142857142857e-06, + "loss": 1.8729, + "step": 492000 + }, + { + "FLOPS loss": 0.0742335394024849, + "L0_d": 725.94, + "MLM loss": 1.7169965505599976, + "epoch": 10.49, + "step": 492499 + }, + { + "epoch": 10.49, + "learning_rate": 1.5736734693877552e-06, + "loss": 1.8707, + "step": 492500 + }, + { + "FLOPS loss": 0.07749656587839127, + "L0_d": 740.7, + "MLM loss": 1.8546206951141357, + "epoch": 10.5, + "step": 492999 + }, + { + "epoch": 10.5, + "learning_rate": 1.4718367346938776e-06, + "loss": 1.8758, + "step": 493000 + }, + { + "FLOPS loss": 0.05815978720784187, + "L0_d": 632.03, + "MLM loss": 1.906503438949585, + "epoch": 10.51, + "step": 493499 + }, + { + "epoch": 10.51, + "learning_rate": 1.369795918367347e-06, + "loss": 1.8754, + "step": 493500 + }, + { + "FLOPS loss": 0.05534592270851135, + "L0_d": 531.83, + "MLM loss": 1.8600013256072998, + "epoch": 10.52, + "step": 493999 + }, + { + "epoch": 10.52, + "learning_rate": 1.2677551020408163e-06, + "loss": 1.8691, + "step": 494000 + }, + { + "FLOPS loss": 0.07069820165634155, + "L0_d": 716.92, + "MLM loss": 1.6526392698287964, + "epoch": 10.53, + "step": 494499 + }, + { + "epoch": 10.53, + "learning_rate": 1.1657142857142857e-06, + "loss": 1.8698, + "step": 494500 + }, + { + "FLOPS loss": 0.08306233584880829, + "L0_d": 738.41, + "MLM loss": 1.822631597518921, + "epoch": 10.54, + "step": 494999 + }, + { + "epoch": 10.54, + "learning_rate": 1.0638775510204083e-06, + "loss": 1.8754, + "step": 495000 + }, + { + "FLOPS loss": 0.0737542062997818, + "L0_d": 657.59, + "MLM loss": 1.7241982221603394, + "epoch": 10.55, + "step": 495499 + }, + { + "epoch": 10.55, + "learning_rate": 9.618367346938776e-07, + "loss": 1.8712, + "step": 495500 + }, + { + "FLOPS loss": 0.08165741711854935, + "L0_d": 817.86, + "MLM loss": 1.7989674806594849, + "epoch": 10.56, + "step": 495999 + }, + { + "epoch": 10.56, + "learning_rate": 8.59795918367347e-07, + "loss": 1.8706, + "step": 496000 + }, + { + "FLOPS loss": 0.0812465101480484, + "L0_d": 775.12, + "MLM loss": 1.8955512046813965, + "epoch": 10.57, + "step": 496499 + }, + { + "epoch": 10.57, + "learning_rate": 7.577551020408163e-07, + "loss": 1.8724, + "step": 496500 + }, + { + "FLOPS loss": 0.08214175701141357, + "L0_d": 734.98, + "MLM loss": 1.8664565086364746, + "epoch": 10.58, + "step": 496999 + }, + { + "epoch": 10.58, + "learning_rate": 6.557142857142857e-07, + "loss": 1.8723, + "step": 497000 + }, + { + "FLOPS loss": 0.07855507731437683, + "L0_d": 850.05, + "MLM loss": 1.7129387855529785, + "epoch": 10.59, + "step": 497499 + }, + { + "epoch": 10.59, + "learning_rate": 5.538775510204082e-07, + "loss": 1.8718, + "step": 497500 + }, + { + "FLOPS loss": 0.069860078394413, + "L0_d": 696.36, + "MLM loss": 1.6870092153549194, + "epoch": 10.61, + "step": 497999 + }, + { + "epoch": 10.61, + "learning_rate": 4.5183673469387754e-07, + "loss": 1.8679, + "step": 498000 + }, + { + "FLOPS loss": 0.065401092171669, + "L0_d": 664.92, + "MLM loss": 1.7763365507125854, + "epoch": 10.62, + "step": 498499 + }, + { + "epoch": 10.62, + "learning_rate": 3.4979591836734695e-07, + "loss": 1.8715, + "step": 498500 + }, + { + "FLOPS loss": 0.07126881182193756, + "L0_d": 922.78, + "MLM loss": 1.885299563407898, + "epoch": 10.63, + "step": 498999 + }, + { + "epoch": 10.63, + "learning_rate": 2.477551020408163e-07, + "loss": 1.8741, + "step": 499000 + }, + { + "FLOPS loss": 0.07045704871416092, + "L0_d": 685.06, + "MLM loss": 1.7598927021026611, + "epoch": 10.64, + "step": 499499 + }, + { + "epoch": 10.64, + "learning_rate": 1.457142857142857e-07, + "loss": 1.8754, + "step": 499500 + }, + { + "FLOPS loss": 0.07906313985586166, + "L0_d": 1157.08, + "MLM loss": 1.733157753944397, + "epoch": 10.65, + "step": 499999 + }, + { + "epoch": 10.65, + "learning_rate": 4.3877551020408164e-08, + "loss": 1.8715, + "step": 500000 + }, + { + "epoch": 10.65, + "step": 500001, + "total_flos": 8.484077462446146e+18, + "train_loss": 3.6746194780010343e-06, + "train_runtime": 301.9815, + "train_samples_per_second": 423867.085, + "train_steps_per_second": 1655.731 + } + ], + "max_steps": 500000, + "num_train_epochs": 11, + "total_flos": 8.484077462446146e+18, + "trial_name": null, + "trial_params": null +}