{ "config": { "epochs": 10, "lr": 0.0003, "warmup_steps": 300, "grad_accum_steps": 8, "grad_clip": 1.0 }, "train": [ { "epoch": 1, "avg_loss": 7.070409, "lr": 0.00027, "step": 270, "time_s": 971.0 }, { "epoch": 2, "avg_loss": 6.138723, "lr": 0.0002986039232296601, "step": 540, "time_s": 969.5 }, { "epoch": 3, "avg_loss": 5.511719, "lr": 0.00029373201732051774, "step": 810, "time_s": 970.5 }, { "epoch": 4, "avg_loss": 5.033876, "lr": 0.00028548319801432657, "step": 1080, "time_s": 966.4 }, { "epoch": 5, "avg_loss": 4.711938, "lr": 0.00027406193668993577, "step": 1350, "time_s": 977.1 }, { "epoch": 6, "avg_loss": 4.464801, "lr": 0.00027245653264858043, "step": 1620, "time_s": 973.7 }, { "epoch": 7, "avg_loss": 4.292098, "lr": 0.0002676801152109725, "step": 1890, "time_s": 977.7 }, { "epoch": 8, "avg_loss": 4.150779, "lr": 0.00025985108200750904, "step": 2160, "time_s": 975.7 }, { "epoch": 9, "avg_loss": 4.048912, "lr": 0.0002595061312929626, "step": 2486, "time_s": 1655.5 }, { "epoch": 10, "avg_loss": 3.914366, "lr": 0.00025878265404557577, "step": 2809, "time_s": 1653.9 }, { "epoch": 11, "avg_loss": 3.793121, "lr": 0.00025758003153357115, "step": 3132, "time_s": 1677.3 }, { "epoch": 12, "avg_loss": 3.684527, "lr": 0.00025590300995857436, "step": 3455, "time_s": 1674.5 }, { "epoch": 13, "avg_loss": 3.584779, "lr": 0.0002557817233080656, "step": 3778, "time_s": 1717.0 }, { "epoch": 14, "avg_loss": 3.478163, "lr": 0.0002554181076117055, "step": 4101, "time_s": 1719.1 }, { "epoch": 15, "avg_loss": 3.377975, "lr": 0.0002548128951431141, "step": 4424, "time_s": 1718.5 }, { "epoch": 16, "avg_loss": 3.283389, "lr": 0.00025396730471965995, "step": 4747, "time_s": 1719.1 }, { "epoch": 17, "avg_loss": 3.198379, "lr": 0.0002528830392479232, "step": 5070, "time_s": 1718.4 }, { "epoch": 18, "avg_loss": 3.123276, "lr": 0.0002515622822942667, "step": 5393, "time_s": 1716.5 }, { "epoch": 19, "avg_loss": 3.054019, "lr": 0.00025000769368742505, "step": 5716, "time_s": 1714.5 }, { "epoch": 20, "avg_loss": 2.990954, "lr": 0.00024822240416196397, "step": 6039, "time_s": 1717.5 }, { "epoch": 21, "avg_loss": 2.932507, "lr": 0.0002462100090533974, "step": 6362, "time_s": 1720.0 }, { "epoch": 22, "avg_loss": 2.879704, "lr": 0.0002439745610576592, "step": 6685, "time_s": 1717.7 }, { "epoch": 23, "avg_loss": 2.830638, "lr": 0.00024152056206951165, "step": 7008, "time_s": 1716.3 }, { "epoch": 24, "avg_loss": 2.785015, "lr": 0.00023885295411633396, "step": 7331, "time_s": 1718.6 }, { "epoch": 25, "avg_loss": 2.743108, "lr": 0.00023597710940553054, "step": 7654, "time_s": 1719.6 }, { "epoch": 26, "avg_loss": 2.72249, "lr": 0.00023586585478300336, "step": 7977, "time_s": 1699.7 }, { "epoch": 27, "avg_loss": 2.681254, "lr": 0.00023553231496741537, "step": 8300, "time_s": 1697.8 }, { "epoch": 28, "avg_loss": 2.644416, "lr": 0.0002349771616635675, "step": 8623, "time_s": 1702.3 }, { "epoch": 29, "avg_loss": 2.610499, "lr": 0.00023420151287633192, "step": 8946, "time_s": 1700.0 }, { "epoch": 30, "avg_loss": 2.578076, "lr": 0.00023320693065914135, "step": 9269, "time_s": 1706.3 }, { "epoch": 31, "avg_loss": 2.547269, "lr": 0.00023199541796821904, "step": 9592, "time_s": 1707.6 }, { "epoch": 32, "avg_loss": 2.51851, "lr": 0.00023056941462889428, "step": 9915, "time_s": 1709.3 }, { "epoch": 33, "avg_loss": 2.49159, "lr": 0.00022893179242211494, "step": 10238, "time_s": 1707.1 }, { "epoch": 34, "avg_loss": 2.467141, "lr": 0.00022708584930106635, "step": 10561, "time_s": 1703.1 }, { "epoch": 35, "avg_loss": 2.440859, "lr": 0.00022503530274953248, "step": 10884, "time_s": 1703.3 }, { "epoch": 36, "avg_loss": 2.439114, "lr": 0.00022498348287805102, "step": 11207, "time_s": 1661.0 }, { "epoch": 37, "avg_loss": 2.412668, "lr": 0.00022482807440355346, "step": 11530, "time_s": 1677.7 }, { "epoch": 38, "avg_loss": 2.390983, "lr": 0.0002245692306954427, "step": 11853, "time_s": 1671.4 }, { "epoch": 39, "avg_loss": 2.368829, "lr": 0.00022420720720120794, "step": 12176, "time_s": 1673.6 }, { "epoch": 40, "avg_loss": 2.349286, "lr": 0.00022374236119432995, "step": 12499, "time_s": 1675.2 }, { "epoch": 41, "avg_loss": 2.329778, "lr": 0.00022317515142169614, "step": 12822, "time_s": 1669.3 }, { "epoch": 42, "avg_loss": 2.312137, "lr": 0.00022250613765087169, "step": 13145, "time_s": 1665.3 }, { "epoch": 43, "avg_loss": 2.226545, "lr": 0.00011089582625474084, "step": 13468, "time_s": 1661.9 }, { "epoch": 44, "avg_loss": 2.203883, "lr": 0.0001104920200486558, "step": 13791, "time_s": 1676.9 }, { "epoch": 45, "avg_loss": 2.190213, "lr": 0.00011004204871515414, "step": 14114, "time_s": 1663.6 }, { "epoch": 46, "avg_loss": 2.178578, "lr": 0.00010954635632161652, "step": 14437, "time_s": 1673.2 }, { "epoch": 47, "avg_loss": 2.167334, "lr": 0.00010900543205658946, "step": 14760, "time_s": 1661.2 }, { "epoch": 48, "avg_loss": 2.157297, "lr": 0.00010841980974701614, "step": 15083, "time_s": 1661.9 }, { "epoch": 49, "avg_loss": 2.14751, "lr": 0.00010779006733141285, "step": 15406, "time_s": 1662.9 }, { "epoch": 50, "avg_loss": 2.138343, "lr": 0.00010711682628951417, "step": 15729, "time_s": 1661.5 } ], "val": [ { "epoch": 1, "avg_loss": 6.459693 }, { "epoch": 2, "avg_loss": 5.804065 }, { "epoch": 3, "avg_loss": 5.179338 }, { "epoch": 4, "avg_loss": 4.774045 }, { "epoch": 5, "avg_loss": 4.511194 }, { "epoch": 6, "avg_loss": 4.305547 }, { "epoch": 7, "avg_loss": 4.154681 }, { "epoch": 8, "avg_loss": 4.037178 }, { "epoch": 9, "avg_loss": 3.829285 }, { "epoch": 10, "avg_loss": 3.708552 }, { "epoch": 11, "avg_loss": 3.594035 }, { "epoch": 12, "avg_loss": 3.495301 }, { "epoch": 13, "avg_loss": 3.318829 }, { "epoch": 14, "avg_loss": 3.237618 }, { "epoch": 15, "avg_loss": 3.171485 }, { "epoch": 16, "avg_loss": 3.082833 }, { "epoch": 17, "avg_loss": 3.013669 }, { "epoch": 18, "avg_loss": 2.949275 }, { "epoch": 19, "avg_loss": 2.898899 }, { "epoch": 20, "avg_loss": 2.855805 }, { "epoch": 21, "avg_loss": 2.804574 }, { "epoch": 22, "avg_loss": 2.76583 }, { "epoch": 23, "avg_loss": 2.720584 }, { "epoch": 24, "avg_loss": 2.692678 }, { "epoch": 25, "avg_loss": 2.660146 }, { "epoch": 26, "avg_loss": 2.477178 }, { "epoch": 27, "avg_loss": 2.463243 }, { "epoch": 28, "avg_loss": 2.459879 }, { "epoch": 29, "avg_loss": 2.455403 }, { "epoch": 30, "avg_loss": 2.439061 }, { "epoch": 31, "avg_loss": 2.430927 }, { "epoch": 32, "avg_loss": 2.414427 }, { "epoch": 33, "avg_loss": 2.401476 }, { "epoch": 34, "avg_loss": 2.404088 }, { "epoch": 35, "avg_loss": 2.382748 }, { "epoch": 36, "avg_loss": 2.18284 }, { "epoch": 37, "avg_loss": 2.201308 }, { "epoch": 38, "avg_loss": 2.214867 }, { "epoch": 39, "avg_loss": 2.223911 }, { "epoch": 40, "avg_loss": 2.225972 }, { "epoch": 41, "avg_loss": 2.224468 }, { "epoch": 42, "avg_loss": 2.238013 }, { "epoch": 43, "avg_loss": 2.184722 }, { "epoch": 44, "avg_loss": 2.177716 }, { "epoch": 45, "avg_loss": 2.176141 }, { "epoch": 46, "avg_loss": 2.170592 }, { "epoch": 47, "avg_loss": 2.173157 }, { "epoch": 48, "avg_loss": 2.173247 }, { "epoch": 49, "avg_loss": 2.175019 }, { "epoch": 50, "avg_loss": 2.18001 } ], "sanity": [] }