{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.78184172591561,
  "eval_steps": 500,
  "global_step": 16000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0004886510786972562,
      "grad_norm": 550.610107421875,
      "learning_rate": 4.396678065461651e-08,
      "loss": 8.0618,
      "step": 10
    },
    {
      "epoch": 0.0009773021573945123,
      "grad_norm": 547.8289794921875,
      "learning_rate": 9.281875915974597e-08,
      "loss": 8.0357,
      "step": 20
    },
    {
      "epoch": 0.0014659532360917686,
      "grad_norm": 534.9840087890625,
      "learning_rate": 1.4167073766487544e-07,
      "loss": 7.8875,
      "step": 30
    },
    {
      "epoch": 0.0019546043147890247,
      "grad_norm": 550.1026000976562,
      "learning_rate": 1.905227161700049e-07,
      "loss": 7.5113,
      "step": 40
    },
    {
      "epoch": 0.002443255393486281,
      "grad_norm": 519.316650390625,
      "learning_rate": 2.3937469467513437e-07,
      "loss": 6.2256,
      "step": 50
    },
    {
      "epoch": 0.0029319064721835372,
      "grad_norm": 198.8031768798828,
      "learning_rate": 2.8822667318026384e-07,
      "loss": 4.6185,
      "step": 60
    },
    {
      "epoch": 0.0034205575508807935,
      "grad_norm": 113.63694763183594,
      "learning_rate": 3.3707865168539325e-07,
      "loss": 2.404,
      "step": 70
    },
    {
      "epoch": 0.003909208629578049,
      "grad_norm": 77.9081802368164,
      "learning_rate": 3.859306301905227e-07,
      "loss": 1.9127,
      "step": 80
    },
    {
      "epoch": 0.004397859708275306,
      "grad_norm": 92.41219329833984,
      "learning_rate": 4.347826086956522e-07,
      "loss": 1.5194,
      "step": 90
    },
    {
      "epoch": 0.004886510786972562,
      "grad_norm": 20.305315017700195,
      "learning_rate": 4.836345872007817e-07,
      "loss": 1.3619,
      "step": 100
    },
    {
      "epoch": 0.005375161865669819,
      "grad_norm": 15.185184478759766,
      "learning_rate": 5.324865657059111e-07,
      "loss": 1.2711,
      "step": 110
    },
    {
      "epoch": 0.0058638129443670745,
      "grad_norm": 17.12596321105957,
      "learning_rate": 5.813385442110406e-07,
      "loss": 1.2171,
      "step": 120
    },
    {
      "epoch": 0.006352464023064331,
      "grad_norm": 24.50565528869629,
      "learning_rate": 6.3019052271617e-07,
      "loss": 1.1917,
      "step": 130
    },
    {
      "epoch": 0.006841115101761587,
      "grad_norm": 9.890480995178223,
      "learning_rate": 6.790425012212995e-07,
      "loss": 1.1793,
      "step": 140
    },
    {
      "epoch": 0.007329766180458844,
      "grad_norm": 16.53375816345215,
      "learning_rate": 7.278944797264289e-07,
      "loss": 1.1749,
      "step": 150
    },
    {
      "epoch": 0.007818417259156099,
      "grad_norm": 19.722103118896484,
      "learning_rate": 7.767464582315585e-07,
      "loss": 1.1696,
      "step": 160
    },
    {
      "epoch": 0.008307068337853355,
      "grad_norm": 21.687253952026367,
      "learning_rate": 8.255984367366879e-07,
      "loss": 1.1658,
      "step": 170
    },
    {
      "epoch": 0.008795719416550612,
      "grad_norm": 20.212892532348633,
      "learning_rate": 8.744504152418174e-07,
      "loss": 1.1672,
      "step": 180
    },
    {
      "epoch": 0.009284370495247869,
      "grad_norm": 14.680685043334961,
      "learning_rate": 9.233023937469468e-07,
      "loss": 1.164,
      "step": 190
    },
    {
      "epoch": 0.009773021573945124,
      "grad_norm": 15.129215240478516,
      "learning_rate": 9.721543722520762e-07,
      "loss": 1.1606,
      "step": 200
    },
    {
      "epoch": 0.01026167265264238,
      "grad_norm": 6.895666122436523,
      "learning_rate": 1.0210063507572057e-06,
      "loss": 1.161,
      "step": 210
    },
    {
      "epoch": 0.010750323731339637,
      "grad_norm": 6.139767646789551,
      "learning_rate": 1.0698583292623353e-06,
      "loss": 1.1602,
      "step": 220
    },
    {
      "epoch": 0.011238974810036894,
      "grad_norm": 25.940549850463867,
      "learning_rate": 1.1187103077674646e-06,
      "loss": 1.158,
      "step": 230
    },
    {
      "epoch": 0.011727625888734149,
      "grad_norm": 5.9631829261779785,
      "learning_rate": 1.167562286272594e-06,
      "loss": 1.1565,
      "step": 240
    },
    {
      "epoch": 0.012216276967431406,
      "grad_norm": 42.288856506347656,
      "learning_rate": 1.2164142647777236e-06,
      "loss": 1.1634,
      "step": 250
    },
    {
      "epoch": 0.012704928046128662,
      "grad_norm": 23.973031997680664,
      "learning_rate": 1.265266243282853e-06,
      "loss": 1.1622,
      "step": 260
    },
    {
      "epoch": 0.013193579124825917,
      "grad_norm": 32.71512985229492,
      "learning_rate": 1.3141182217879824e-06,
      "loss": 1.1632,
      "step": 270
    },
    {
      "epoch": 0.013682230203523174,
      "grad_norm": 45.931095123291016,
      "learning_rate": 1.362970200293112e-06,
      "loss": 1.1611,
      "step": 280
    },
    {
      "epoch": 0.01417088128222043,
      "grad_norm": 31.298593521118164,
      "learning_rate": 1.4118221787982415e-06,
      "loss": 1.1609,
      "step": 290
    },
    {
      "epoch": 0.014659532360917688,
      "grad_norm": 37.475921630859375,
      "learning_rate": 1.4606741573033708e-06,
      "loss": 1.159,
      "step": 300
    },
    {
      "epoch": 0.015148183439614942,
      "grad_norm": 56.90618896484375,
      "learning_rate": 1.5095261358085003e-06,
      "loss": 1.1611,
      "step": 310
    },
    {
      "epoch": 0.015636834518312197,
      "grad_norm": 26.84503746032715,
      "learning_rate": 1.5583781143136298e-06,
      "loss": 1.1605,
      "step": 320
    },
    {
      "epoch": 0.016125485597009454,
      "grad_norm": 31.706214904785156,
      "learning_rate": 1.6072300928187593e-06,
      "loss": 1.1636,
      "step": 330
    },
    {
      "epoch": 0.01661413667570671,
      "grad_norm": 20.083066940307617,
      "learning_rate": 1.6560820713238887e-06,
      "loss": 1.155,
      "step": 340
    },
    {
      "epoch": 0.017102787754403968,
      "grad_norm": 20.37936782836914,
      "learning_rate": 1.7049340498290182e-06,
      "loss": 1.1544,
      "step": 350
    },
    {
      "epoch": 0.017591438833101224,
      "grad_norm": 29.238149642944336,
      "learning_rate": 1.7537860283341477e-06,
      "loss": 1.1526,
      "step": 360
    },
    {
      "epoch": 0.01808008991179848,
      "grad_norm": 24.459911346435547,
      "learning_rate": 1.802638006839277e-06,
      "loss": 1.1542,
      "step": 370
    },
    {
      "epoch": 0.018568740990495738,
      "grad_norm": 25.11469841003418,
      "learning_rate": 1.8514899853444065e-06,
      "loss": 1.153,
      "step": 380
    },
    {
      "epoch": 0.019057392069192994,
      "grad_norm": 19.211380004882812,
      "learning_rate": 1.900341963849536e-06,
      "loss": 1.1524,
      "step": 390
    },
    {
      "epoch": 0.019546043147890248,
      "grad_norm": 29.28157615661621,
      "learning_rate": 1.9491939423546656e-06,
      "loss": 1.1507,
      "step": 400
    },
    {
      "epoch": 0.020034694226587504,
      "grad_norm": 18.40865707397461,
      "learning_rate": 1.998045920859795e-06,
      "loss": 1.1514,
      "step": 410
    },
    {
      "epoch": 0.02052334530528476,
      "grad_norm": 32.68934631347656,
      "learning_rate": 2.046897899364924e-06,
      "loss": 1.1509,
      "step": 420
    },
    {
      "epoch": 0.021011996383982018,
      "grad_norm": 28.276269912719727,
      "learning_rate": 2.0957498778700537e-06,
      "loss": 1.1508,
      "step": 430
    },
    {
      "epoch": 0.021500647462679275,
      "grad_norm": 29.66724967956543,
      "learning_rate": 2.1446018563751832e-06,
      "loss": 1.1507,
      "step": 440
    },
    {
      "epoch": 0.02198929854137653,
      "grad_norm": 33.39693069458008,
      "learning_rate": 2.1934538348803127e-06,
      "loss": 1.1505,
      "step": 450
    },
    {
      "epoch": 0.022477949620073788,
      "grad_norm": 28.482940673828125,
      "learning_rate": 2.2423058133854423e-06,
      "loss": 1.1488,
      "step": 460
    },
    {
      "epoch": 0.02296660069877104,
      "grad_norm": 22.53483009338379,
      "learning_rate": 2.2911577918905718e-06,
      "loss": 1.1495,
      "step": 470
    },
    {
      "epoch": 0.023455251777468298,
      "grad_norm": 20.745651245117188,
      "learning_rate": 2.3400097703957013e-06,
      "loss": 1.1477,
      "step": 480
    },
    {
      "epoch": 0.023943902856165555,
      "grad_norm": 27.499927520751953,
      "learning_rate": 2.388861748900831e-06,
      "loss": 1.1491,
      "step": 490
    },
    {
      "epoch": 0.02443255393486281,
      "grad_norm": 17.32890510559082,
      "learning_rate": 2.43771372740596e-06,
      "loss": 1.1483,
      "step": 500
    },
    {
      "epoch": 0.02443255393486281,
      "eval_loss": 1.1245596408843994,
      "eval_runtime": 728.0762,
      "eval_samples_per_second": 242.98,
      "eval_steps_per_second": 0.475,
      "step": 500
    },
    {
      "epoch": 0.024921205013560068,
      "grad_norm": 28.78767967224121,
      "learning_rate": 2.4865657059110894e-06,
      "loss": 1.1466,
      "step": 510
    },
    {
      "epoch": 0.025409856092257325,
      "grad_norm": 19.717103958129883,
      "learning_rate": 2.5354176844162194e-06,
      "loss": 1.1464,
      "step": 520
    },
    {
      "epoch": 0.02589850717095458,
      "grad_norm": 27.598007202148438,
      "learning_rate": 2.584269662921349e-06,
      "loss": 1.1465,
      "step": 530
    },
    {
      "epoch": 0.026387158249651835,
      "grad_norm": 23.63872528076172,
      "learning_rate": 2.633121641426478e-06,
      "loss": 1.1453,
      "step": 540
    },
    {
      "epoch": 0.02687580932834909,
      "grad_norm": 26.8532772064209,
      "learning_rate": 2.6819736199316075e-06,
      "loss": 1.1466,
      "step": 550
    },
    {
      "epoch": 0.027364460407046348,
      "grad_norm": 22.594478607177734,
      "learning_rate": 2.730825598436737e-06,
      "loss": 1.146,
      "step": 560
    },
    {
      "epoch": 0.027853111485743605,
      "grad_norm": 22.817705154418945,
      "learning_rate": 2.7796775769418666e-06,
      "loss": 1.1437,
      "step": 570
    },
    {
      "epoch": 0.02834176256444086,
      "grad_norm": 7.5399250984191895,
      "learning_rate": 2.828529555446996e-06,
      "loss": 1.1443,
      "step": 580
    },
    {
      "epoch": 0.02883041364313812,
      "grad_norm": 34.241981506347656,
      "learning_rate": 2.8773815339521256e-06,
      "loss": 1.1723,
      "step": 590
    },
    {
      "epoch": 0.029319064721835375,
      "grad_norm": 41.769996643066406,
      "learning_rate": 2.926233512457255e-06,
      "loss": 1.1566,
      "step": 600
    },
    {
      "epoch": 0.02980771580053263,
      "grad_norm": 60.579036712646484,
      "learning_rate": 2.9750854909623842e-06,
      "loss": 1.1581,
      "step": 610
    },
    {
      "epoch": 0.030296366879229885,
      "grad_norm": 9.520317077636719,
      "learning_rate": 3.0239374694675137e-06,
      "loss": 1.1521,
      "step": 620
    },
    {
      "epoch": 0.03078501795792714,
      "grad_norm": 48.57497787475586,
      "learning_rate": 3.0727894479726433e-06,
      "loss": 1.1721,
      "step": 630
    },
    {
      "epoch": 0.031273669036624395,
      "grad_norm": 44.00090026855469,
      "learning_rate": 3.1216414264777728e-06,
      "loss": 1.1517,
      "step": 640
    },
    {
      "epoch": 0.03176232011532165,
      "grad_norm": 25.236425399780273,
      "learning_rate": 3.1704934049829023e-06,
      "loss": 1.1657,
      "step": 650
    },
    {
      "epoch": 0.03225097119401891,
      "grad_norm": 26.25858497619629,
      "learning_rate": 3.219345383488032e-06,
      "loss": 1.146,
      "step": 660
    },
    {
      "epoch": 0.032739622272716165,
      "grad_norm": 33.179771423339844,
      "learning_rate": 3.2681973619931613e-06,
      "loss": 1.1452,
      "step": 670
    },
    {
      "epoch": 0.03322827335141342,
      "grad_norm": 26.868507385253906,
      "learning_rate": 3.3170493404982904e-06,
      "loss": 1.1399,
      "step": 680
    },
    {
      "epoch": 0.03371692443011068,
      "grad_norm": 14.93895149230957,
      "learning_rate": 3.36590131900342e-06,
      "loss": 1.142,
      "step": 690
    },
    {
      "epoch": 0.034205575508807935,
      "grad_norm": 10.044151306152344,
      "learning_rate": 3.4147532975085495e-06,
      "loss": 1.1391,
      "step": 700
    },
    {
      "epoch": 0.03469422658750519,
      "grad_norm": 22.18167495727539,
      "learning_rate": 3.463605276013679e-06,
      "loss": 1.1439,
      "step": 710
    },
    {
      "epoch": 0.03518287766620245,
      "grad_norm": 17.276782989501953,
      "learning_rate": 3.5124572545188085e-06,
      "loss": 1.1402,
      "step": 720
    },
    {
      "epoch": 0.035671528744899705,
      "grad_norm": 22.945816040039062,
      "learning_rate": 3.561309233023938e-06,
      "loss": 1.1409,
      "step": 730
    },
    {
      "epoch": 0.03616017982359696,
      "grad_norm": 28.2205810546875,
      "learning_rate": 3.6101612115290676e-06,
      "loss": 1.1425,
      "step": 740
    },
    {
      "epoch": 0.03664883090229422,
      "grad_norm": 29.494741439819336,
      "learning_rate": 3.6590131900341966e-06,
      "loss": 1.1423,
      "step": 750
    },
    {
      "epoch": 0.037137481980991476,
      "grad_norm": 12.51252269744873,
      "learning_rate": 3.707865168539326e-06,
      "loss": 1.14,
      "step": 760
    },
    {
      "epoch": 0.03762613305968873,
      "grad_norm": 19.594589233398438,
      "learning_rate": 3.7567171470444557e-06,
      "loss": 1.1472,
      "step": 770
    },
    {
      "epoch": 0.03811478413838599,
      "grad_norm": 8.805150032043457,
      "learning_rate": 3.805569125549585e-06,
      "loss": 1.143,
      "step": 780
    },
    {
      "epoch": 0.03860343521708324,
      "grad_norm": 16.11551856994629,
      "learning_rate": 3.854421104054714e-06,
      "loss": 1.1394,
      "step": 790
    },
    {
      "epoch": 0.039092086295780495,
      "grad_norm": 30.43114471435547,
      "learning_rate": 3.903273082559844e-06,
      "loss": 1.1412,
      "step": 800
    },
    {
      "epoch": 0.03958073737447775,
      "grad_norm": 24.658550262451172,
      "learning_rate": 3.952125061064973e-06,
      "loss": 1.1379,
      "step": 810
    },
    {
      "epoch": 0.04006938845317501,
      "grad_norm": 30.698740005493164,
      "learning_rate": 4.000977039570103e-06,
      "loss": 1.1402,
      "step": 820
    },
    {
      "epoch": 0.040558039531872266,
      "grad_norm": 15.285526275634766,
      "learning_rate": 4.049829018075232e-06,
      "loss": 1.1397,
      "step": 830
    },
    {
      "epoch": 0.04104669061056952,
      "grad_norm": 16.959575653076172,
      "learning_rate": 4.098680996580362e-06,
      "loss": 1.143,
      "step": 840
    },
    {
      "epoch": 0.04153534168926678,
      "grad_norm": 9.172962188720703,
      "learning_rate": 4.1475329750854914e-06,
      "loss": 1.1394,
      "step": 850
    },
    {
      "epoch": 0.042023992767964036,
      "grad_norm": 34.94649887084961,
      "learning_rate": 4.196384953590621e-06,
      "loss": 1.1586,
      "step": 860
    },
    {
      "epoch": 0.04251264384666129,
      "grad_norm": 31.494056701660156,
      "learning_rate": 4.2452369320957505e-06,
      "loss": 1.1561,
      "step": 870
    },
    {
      "epoch": 0.04300129492535855,
      "grad_norm": 30.298629760742188,
      "learning_rate": 4.29408891060088e-06,
      "loss": 1.1579,
      "step": 880
    },
    {
      "epoch": 0.043489946004055806,
      "grad_norm": 8.591865539550781,
      "learning_rate": 4.3429408891060095e-06,
      "loss": 1.1441,
      "step": 890
    },
    {
      "epoch": 0.04397859708275306,
      "grad_norm": 22.960233688354492,
      "learning_rate": 4.391792867611139e-06,
      "loss": 1.1444,
      "step": 900
    },
    {
      "epoch": 0.04446724816145032,
      "grad_norm": 14.041954040527344,
      "learning_rate": 4.4406448461162685e-06,
      "loss": 1.141,
      "step": 910
    },
    {
      "epoch": 0.044955899240147576,
      "grad_norm": 25.523542404174805,
      "learning_rate": 4.489496824621398e-06,
      "loss": 1.1335,
      "step": 920
    },
    {
      "epoch": 0.045444550318844826,
      "grad_norm": 12.317065238952637,
      "learning_rate": 4.538348803126527e-06,
      "loss": 1.1412,
      "step": 930
    },
    {
      "epoch": 0.04593320139754208,
      "grad_norm": 6.889744758605957,
      "learning_rate": 4.587200781631656e-06,
      "loss": 1.1431,
      "step": 940
    },
    {
      "epoch": 0.04642185247623934,
      "grad_norm": 14.626124382019043,
      "learning_rate": 4.636052760136786e-06,
      "loss": 1.1369,
      "step": 950
    },
    {
      "epoch": 0.046910503554936596,
      "grad_norm": 8.889772415161133,
      "learning_rate": 4.684904738641915e-06,
      "loss": 1.1327,
      "step": 960
    },
    {
      "epoch": 0.04739915463363385,
      "grad_norm": 22.604360580444336,
      "learning_rate": 4.733756717147045e-06,
      "loss": 1.1311,
      "step": 970
    },
    {
      "epoch": 0.04788780571233111,
      "grad_norm": 18.373239517211914,
      "learning_rate": 4.782608695652174e-06,
      "loss": 1.1329,
      "step": 980
    },
    {
      "epoch": 0.048376456791028366,
      "grad_norm": 18.741851806640625,
      "learning_rate": 4.831460674157304e-06,
      "loss": 1.1275,
      "step": 990
    },
    {
      "epoch": 0.04886510786972562,
      "grad_norm": 21.531051635742188,
      "learning_rate": 4.880312652662433e-06,
      "loss": 1.1242,
      "step": 1000
    },
    {
      "epoch": 0.04886510786972562,
      "eval_loss": 1.0948448181152344,
      "eval_runtime": 728.3165,
      "eval_samples_per_second": 242.9,
      "eval_steps_per_second": 0.475,
      "step": 1000
    },
    {
      "epoch": 0.04935375894842288,
      "grad_norm": 9.384544372558594,
      "learning_rate": 4.929164631167563e-06,
      "loss": 1.1484,
      "step": 1010
    },
    {
      "epoch": 0.049842410027120136,
      "grad_norm": 25.287551879882812,
      "learning_rate": 4.978016609672692e-06,
      "loss": 1.1365,
      "step": 1020
    },
    {
      "epoch": 0.05033106110581739,
      "grad_norm": 25.104299545288086,
      "learning_rate": 5.026868588177821e-06,
      "loss": 1.1073,
      "step": 1030
    },
    {
      "epoch": 0.05081971218451465,
      "grad_norm": 5.135197639465332,
      "learning_rate": 5.0757205666829515e-06,
      "loss": 1.0908,
      "step": 1040
    },
    {
      "epoch": 0.051308363263211906,
      "grad_norm": 10.835426330566406,
      "learning_rate": 5.12457254518808e-06,
      "loss": 1.0344,
      "step": 1050
    },
    {
      "epoch": 0.05179701434190916,
      "grad_norm": 17.45260238647461,
      "learning_rate": 5.1734245236932105e-06,
      "loss": 0.9916,
      "step": 1060
    },
    {
      "epoch": 0.05228566542060641,
      "grad_norm": 18.409074783325195,
      "learning_rate": 5.222276502198339e-06,
      "loss": 0.9616,
      "step": 1070
    },
    {
      "epoch": 0.05277431649930367,
      "grad_norm": 13.753133773803711,
      "learning_rate": 5.271128480703469e-06,
      "loss": 0.9379,
      "step": 1080
    },
    {
      "epoch": 0.053262967578000926,
      "grad_norm": 16.086511611938477,
      "learning_rate": 5.319980459208598e-06,
      "loss": 0.922,
      "step": 1090
    },
    {
      "epoch": 0.05375161865669818,
      "grad_norm": 14.6001558303833,
      "learning_rate": 5.368832437713728e-06,
      "loss": 0.9061,
      "step": 1100
    },
    {
      "epoch": 0.05424026973539544,
      "grad_norm": 22.474435806274414,
      "learning_rate": 5.417684416218857e-06,
      "loss": 0.9022,
      "step": 1110
    },
    {
      "epoch": 0.054728920814092696,
      "grad_norm": 22.234281539916992,
      "learning_rate": 5.466536394723987e-06,
      "loss": 0.9091,
      "step": 1120
    },
    {
      "epoch": 0.05521757189278995,
      "grad_norm": 10.945754051208496,
      "learning_rate": 5.5153883732291154e-06,
      "loss": 0.903,
      "step": 1130
    },
    {
      "epoch": 0.05570622297148721,
      "grad_norm": 12.38178539276123,
      "learning_rate": 5.564240351734246e-06,
      "loss": 0.882,
      "step": 1140
    },
    {
      "epoch": 0.056194874050184467,
      "grad_norm": 18.168428421020508,
      "learning_rate": 5.6130923302393745e-06,
      "loss": 0.8671,
      "step": 1150
    },
    {
      "epoch": 0.05668352512888172,
      "grad_norm": 13.480072975158691,
      "learning_rate": 5.661944308744505e-06,
      "loss": 0.8604,
      "step": 1160
    },
    {
      "epoch": 0.05717217620757898,
      "grad_norm": 15.529900550842285,
      "learning_rate": 5.7107962872496335e-06,
      "loss": 0.8467,
      "step": 1170
    },
    {
      "epoch": 0.05766082728627624,
      "grad_norm": 12.60476016998291,
      "learning_rate": 5.759648265754764e-06,
      "loss": 0.8376,
      "step": 1180
    },
    {
      "epoch": 0.05814947836497349,
      "grad_norm": 12.737000465393066,
      "learning_rate": 5.8085002442598926e-06,
      "loss": 0.8262,
      "step": 1190
    },
    {
      "epoch": 0.05863812944367075,
      "grad_norm": 11.14971923828125,
      "learning_rate": 5.857352222765023e-06,
      "loss": 0.8164,
      "step": 1200
    },
    {
      "epoch": 0.059126780522368,
      "grad_norm": 13.185476303100586,
      "learning_rate": 5.906204201270152e-06,
      "loss": 0.8107,
      "step": 1210
    },
    {
      "epoch": 0.05961543160106526,
      "grad_norm": 19.2025203704834,
      "learning_rate": 5.955056179775281e-06,
      "loss": 0.8026,
      "step": 1220
    },
    {
      "epoch": 0.06010408267976251,
      "grad_norm": 15.930268287658691,
      "learning_rate": 6.003908158280411e-06,
      "loss": 0.8046,
      "step": 1230
    },
    {
      "epoch": 0.06059273375845977,
      "grad_norm": 9.219900131225586,
      "learning_rate": 6.05276013678554e-06,
      "loss": 0.7913,
      "step": 1240
    },
    {
      "epoch": 0.06108138483715703,
      "grad_norm": 9.282882690429688,
      "learning_rate": 6.10161211529067e-06,
      "loss": 0.7798,
      "step": 1250
    },
    {
      "epoch": 0.06157003591585428,
      "grad_norm": 10.684017181396484,
      "learning_rate": 6.150464093795799e-06,
      "loss": 0.7746,
      "step": 1260
    },
    {
      "epoch": 0.06205868699455154,
      "grad_norm": 19.030454635620117,
      "learning_rate": 6.199316072300928e-06,
      "loss": 0.7789,
      "step": 1270
    },
    {
      "epoch": 0.06254733807324879,
      "grad_norm": 14.472164154052734,
      "learning_rate": 6.248168050806058e-06,
      "loss": 0.7645,
      "step": 1280
    },
    {
      "epoch": 0.06303598915194605,
      "grad_norm": 15.92104721069336,
      "learning_rate": 6.297020029311187e-06,
      "loss": 0.7601,
      "step": 1290
    },
    {
      "epoch": 0.0635246402306433,
      "grad_norm": 12.93683910369873,
      "learning_rate": 6.345872007816317e-06,
      "loss": 0.7508,
      "step": 1300
    },
    {
      "epoch": 0.06401329130934057,
      "grad_norm": 12.283439636230469,
      "learning_rate": 6.394723986321446e-06,
      "loss": 0.7441,
      "step": 1310
    },
    {
      "epoch": 0.06450194238803782,
      "grad_norm": 11.30448055267334,
      "learning_rate": 6.443575964826576e-06,
      "loss": 0.7359,
      "step": 1320
    },
    {
      "epoch": 0.06499059346673508,
      "grad_norm": 10.10558795928955,
      "learning_rate": 6.492427943331705e-06,
      "loss": 0.7312,
      "step": 1330
    },
    {
      "epoch": 0.06547924454543233,
      "grad_norm": 10.84056282043457,
      "learning_rate": 6.541279921836835e-06,
      "loss": 0.7257,
      "step": 1340
    },
    {
      "epoch": 0.0659678956241296,
      "grad_norm": 11.601236343383789,
      "learning_rate": 6.590131900341964e-06,
      "loss": 0.7205,
      "step": 1350
    },
    {
      "epoch": 0.06645654670282684,
      "grad_norm": 9.640713691711426,
      "learning_rate": 6.6389838788470936e-06,
      "loss": 0.7179,
      "step": 1360
    },
    {
      "epoch": 0.06694519778152411,
      "grad_norm": 7.962968826293945,
      "learning_rate": 6.687835857352223e-06,
      "loss": 0.7218,
      "step": 1370
    },
    {
      "epoch": 0.06743384886022136,
      "grad_norm": 6.22469425201416,
      "learning_rate": 6.736687835857353e-06,
      "loss": 0.7158,
      "step": 1380
    },
    {
      "epoch": 0.06792249993891862,
      "grad_norm": 11.041025161743164,
      "learning_rate": 6.785539814362482e-06,
      "loss": 0.7075,
      "step": 1390
    },
    {
      "epoch": 0.06841115101761587,
      "grad_norm": 11.427011489868164,
      "learning_rate": 6.834391792867612e-06,
      "loss": 0.7097,
      "step": 1400
    },
    {
      "epoch": 0.06889980209631313,
      "grad_norm": 5.472978115081787,
      "learning_rate": 6.88324377137274e-06,
      "loss": 0.7058,
      "step": 1410
    },
    {
      "epoch": 0.06938845317501038,
      "grad_norm": 11.191500663757324,
      "learning_rate": 6.932095749877871e-06,
      "loss": 0.7033,
      "step": 1420
    },
    {
      "epoch": 0.06987710425370763,
      "grad_norm": 9.202252388000488,
      "learning_rate": 6.980947728382999e-06,
      "loss": 0.7019,
      "step": 1430
    },
    {
      "epoch": 0.0703657553324049,
      "grad_norm": 5.8239216804504395,
      "learning_rate": 7.02979970688813e-06,
      "loss": 0.6908,
      "step": 1440
    },
    {
      "epoch": 0.07085440641110215,
      "grad_norm": 6.5890092849731445,
      "learning_rate": 7.078651685393258e-06,
      "loss": 0.6821,
      "step": 1450
    },
    {
      "epoch": 0.07134305748979941,
      "grad_norm": 5.046870231628418,
      "learning_rate": 7.127503663898389e-06,
      "loss": 0.6782,
      "step": 1460
    },
    {
      "epoch": 0.07183170856849666,
      "grad_norm": 6.238111972808838,
      "learning_rate": 7.1763556424035174e-06,
      "loss": 0.6597,
      "step": 1470
    },
    {
      "epoch": 0.07232035964719392,
      "grad_norm": 14.37743091583252,
      "learning_rate": 7.225207620908648e-06,
      "loss": 0.6472,
      "step": 1480
    },
    {
      "epoch": 0.07280901072589117,
      "grad_norm": 8.147233963012695,
      "learning_rate": 7.2740595994137765e-06,
      "loss": 0.6435,
      "step": 1490
    },
    {
      "epoch": 0.07329766180458844,
      "grad_norm": 10.538481712341309,
      "learning_rate": 7.322911577918906e-06,
      "loss": 0.6375,
      "step": 1500
    },
    {
      "epoch": 0.07329766180458844,
      "eval_loss": 0.6336132884025574,
      "eval_runtime": 729.8371,
      "eval_samples_per_second": 242.394,
      "eval_steps_per_second": 0.474,
      "step": 1500
    },
    {
      "epoch": 0.07378631288328569,
      "grad_norm": 8.202781677246094,
      "learning_rate": 7.3717635564240355e-06,
      "loss": 0.627,
      "step": 1510
    },
    {
      "epoch": 0.07427496396198295,
      "grad_norm": 9.16813850402832,
      "learning_rate": 7.420615534929165e-06,
      "loss": 0.6136,
      "step": 1520
    },
    {
      "epoch": 0.0747636150406802,
      "grad_norm": 4.204853057861328,
      "learning_rate": 7.4694675134342946e-06,
      "loss": 0.6092,
      "step": 1530
    },
    {
      "epoch": 0.07525226611937746,
      "grad_norm": 7.2187652587890625,
      "learning_rate": 7.518319491939424e-06,
      "loss": 0.619,
      "step": 1540
    },
    {
      "epoch": 0.07574091719807471,
      "grad_norm": 6.75137996673584,
      "learning_rate": 7.567171470444553e-06,
      "loss": 0.6183,
      "step": 1550
    },
    {
      "epoch": 0.07622956827677198,
      "grad_norm": 12.58353042602539,
      "learning_rate": 7.616023448949683e-06,
      "loss": 0.6053,
      "step": 1560
    },
    {
      "epoch": 0.07671821935546923,
      "grad_norm": 5.846193313598633,
      "learning_rate": 7.664875427454813e-06,
      "loss": 0.607,
      "step": 1570
    },
    {
      "epoch": 0.07720687043416648,
      "grad_norm": 7.444247722625732,
      "learning_rate": 7.713727405959941e-06,
      "loss": 0.5934,
      "step": 1580
    },
    {
      "epoch": 0.07769552151286374,
      "grad_norm": 3.659825563430786,
      "learning_rate": 7.762579384465072e-06,
      "loss": 0.5938,
      "step": 1590
    },
    {
      "epoch": 0.07818417259156099,
      "grad_norm": 6.078113079071045,
      "learning_rate": 7.8114313629702e-06,
      "loss": 0.5942,
      "step": 1600
    },
    {
      "epoch": 0.07867282367025825,
      "grad_norm": 7.572592735290527,
      "learning_rate": 7.86028334147533e-06,
      "loss": 0.6032,
      "step": 1610
    },
    {
      "epoch": 0.0791614747489555,
      "grad_norm": 6.511207103729248,
      "learning_rate": 7.90913531998046e-06,
      "loss": 0.5873,
      "step": 1620
    },
    {
      "epoch": 0.07965012582765277,
      "grad_norm": 6.170757293701172,
      "learning_rate": 7.957987298485588e-06,
      "loss": 0.5804,
      "step": 1630
    },
    {
      "epoch": 0.08013877690635002,
      "grad_norm": 14.552532196044922,
      "learning_rate": 8.006839276990718e-06,
      "loss": 0.5753,
      "step": 1640
    },
    {
      "epoch": 0.08062742798504728,
      "grad_norm": 8.183059692382812,
      "learning_rate": 8.055691255495847e-06,
      "loss": 0.5739,
      "step": 1650
    },
    {
      "epoch": 0.08111607906374453,
      "grad_norm": 4.893775463104248,
      "learning_rate": 8.104543234000977e-06,
      "loss": 0.5722,
      "step": 1660
    },
    {
      "epoch": 0.0816047301424418,
      "grad_norm": 9.298670768737793,
      "learning_rate": 8.153395212506106e-06,
      "loss": 0.5696,
      "step": 1670
    },
    {
      "epoch": 0.08209338122113904,
      "grad_norm": 5.700584888458252,
      "learning_rate": 8.202247191011237e-06,
      "loss": 0.568,
      "step": 1680
    },
    {
      "epoch": 0.08258203229983631,
      "grad_norm": 14.690134048461914,
      "learning_rate": 8.251099169516365e-06,
      "loss": 0.5739,
      "step": 1690
    },
    {
      "epoch": 0.08307068337853356,
      "grad_norm": 12.68682861328125,
      "learning_rate": 8.299951148021496e-06,
      "loss": 0.5801,
      "step": 1700
    },
    {
      "epoch": 0.08355933445723081,
      "grad_norm": 8.979551315307617,
      "learning_rate": 8.348803126526624e-06,
      "loss": 0.5791,
      "step": 1710
    },
    {
      "epoch": 0.08404798553592807,
      "grad_norm": 5.448888301849365,
      "learning_rate": 8.397655105031755e-06,
      "loss": 0.5657,
      "step": 1720
    },
    {
      "epoch": 0.08453663661462532,
      "grad_norm": 8.006872177124023,
      "learning_rate": 8.446507083536883e-06,
      "loss": 0.5484,
      "step": 1730
    },
    {
      "epoch": 0.08502528769332258,
      "grad_norm": 6.7078046798706055,
      "learning_rate": 8.495359062042014e-06,
      "loss": 0.5555,
      "step": 1740
    },
    {
      "epoch": 0.08551393877201983,
      "grad_norm": 8.614073753356934,
      "learning_rate": 8.544211040547142e-06,
      "loss": 0.5606,
      "step": 1750
    },
    {
      "epoch": 0.0860025898507171,
      "grad_norm": 4.551246643066406,
      "learning_rate": 8.593063019052273e-06,
      "loss": 0.5544,
      "step": 1760
    },
    {
      "epoch": 0.08649124092941435,
      "grad_norm": 3.444021463394165,
      "learning_rate": 8.641914997557401e-06,
      "loss": 0.5411,
      "step": 1770
    },
    {
      "epoch": 0.08697989200811161,
      "grad_norm": 17.660511016845703,
      "learning_rate": 8.690766976062532e-06,
      "loss": 0.5427,
      "step": 1780
    },
    {
      "epoch": 0.08746854308680886,
      "grad_norm": 7.721867561340332,
      "learning_rate": 8.73961895456766e-06,
      "loss": 0.5526,
      "step": 1790
    },
    {
      "epoch": 0.08795719416550613,
      "grad_norm": 3.451046943664551,
      "learning_rate": 8.78847093307279e-06,
      "loss": 0.5425,
      "step": 1800
    },
    {
      "epoch": 0.08844584524420337,
      "grad_norm": 4.078919887542725,
      "learning_rate": 8.83732291157792e-06,
      "loss": 0.5543,
      "step": 1810
    },
    {
      "epoch": 0.08893449632290064,
      "grad_norm": 4.645016193389893,
      "learning_rate": 8.88617489008305e-06,
      "loss": 0.5463,
      "step": 1820
    },
    {
      "epoch": 0.08942314740159789,
      "grad_norm": 8.30947208404541,
      "learning_rate": 8.935026868588178e-06,
      "loss": 0.5452,
      "step": 1830
    },
    {
      "epoch": 0.08991179848029515,
      "grad_norm": 5.685572147369385,
      "learning_rate": 8.983878847093309e-06,
      "loss": 0.5369,
      "step": 1840
    },
    {
      "epoch": 0.0904004495589924,
      "grad_norm": 9.45528793334961,
      "learning_rate": 9.032730825598438e-06,
      "loss": 0.5299,
      "step": 1850
    },
    {
      "epoch": 0.09088910063768965,
      "grad_norm": 10.99970817565918,
      "learning_rate": 9.081582804103566e-06,
      "loss": 0.5287,
      "step": 1860
    },
    {
      "epoch": 0.09137775171638692,
      "grad_norm": 6.199814796447754,
      "learning_rate": 9.130434782608697e-06,
      "loss": 0.5397,
      "step": 1870
    },
    {
      "epoch": 0.09186640279508416,
      "grad_norm": 5.611557483673096,
      "learning_rate": 9.179286761113825e-06,
      "loss": 0.5472,
      "step": 1880
    },
    {
      "epoch": 0.09235505387378143,
      "grad_norm": 4.567397594451904,
      "learning_rate": 9.228138739618956e-06,
      "loss": 0.5353,
      "step": 1890
    },
    {
      "epoch": 0.09284370495247868,
      "grad_norm": 17.8961238861084,
      "learning_rate": 9.276990718124084e-06,
      "loss": 0.5263,
      "step": 1900
    },
    {
      "epoch": 0.09333235603117594,
      "grad_norm": 8.548867225646973,
      "learning_rate": 9.325842696629213e-06,
      "loss": 0.5301,
      "step": 1910
    },
    {
      "epoch": 0.09382100710987319,
      "grad_norm": 5.053003787994385,
      "learning_rate": 9.374694675134343e-06,
      "loss": 0.5316,
      "step": 1920
    },
    {
      "epoch": 0.09430965818857046,
      "grad_norm": 10.809515953063965,
      "learning_rate": 9.423546653639472e-06,
      "loss": 0.5255,
      "step": 1930
    },
    {
      "epoch": 0.0947983092672677,
      "grad_norm": 4.784992218017578,
      "learning_rate": 9.472398632144602e-06,
      "loss": 0.5232,
      "step": 1940
    },
    {
      "epoch": 0.09528696034596497,
      "grad_norm": 6.658916473388672,
      "learning_rate": 9.521250610649731e-06,
      "loss": 0.5225,
      "step": 1950
    },
    {
      "epoch": 0.09577561142466222,
      "grad_norm": 5.238591194152832,
      "learning_rate": 9.570102589154861e-06,
      "loss": 0.52,
      "step": 1960
    },
    {
      "epoch": 0.09626426250335948,
      "grad_norm": 6.568732261657715,
      "learning_rate": 9.61895456765999e-06,
      "loss": 0.5117,
      "step": 1970
    },
    {
      "epoch": 0.09675291358205673,
      "grad_norm": 11.915630340576172,
      "learning_rate": 9.66780654616512e-06,
      "loss": 0.5113,
      "step": 1980
    },
    {
      "epoch": 0.09724156466075398,
      "grad_norm": 3.4283180236816406,
      "learning_rate": 9.716658524670249e-06,
      "loss": 0.5206,
      "step": 1990
    },
    {
      "epoch": 0.09773021573945125,
      "grad_norm": 7.299953937530518,
      "learning_rate": 9.76551050317538e-06,
      "loss": 0.5319,
      "step": 2000
    },
    {
      "epoch": 0.09773021573945125,
      "eval_loss": 0.5161277055740356,
      "eval_runtime": 728.8014,
      "eval_samples_per_second": 242.738,
      "eval_steps_per_second": 0.475,
      "step": 2000
    },
    {
      "epoch": 0.0982188668181485,
      "grad_norm": 4.911329746246338,
      "learning_rate": 9.814362481680508e-06,
      "loss": 0.5179,
      "step": 2010
    },
    {
      "epoch": 0.09870751789684576,
      "grad_norm": 3.644986152648926,
      "learning_rate": 9.863214460185639e-06,
      "loss": 0.5148,
      "step": 2020
    },
    {
      "epoch": 0.09919616897554301,
      "grad_norm": 5.680597305297852,
      "learning_rate": 9.912066438690767e-06,
      "loss": 0.5119,
      "step": 2030
    },
    {
      "epoch": 0.09968482005424027,
      "grad_norm": 6.847180366516113,
      "learning_rate": 9.960918417195898e-06,
      "loss": 0.5135,
      "step": 2040
    },
    {
      "epoch": 0.10017347113293752,
      "grad_norm": 4.022679328918457,
      "learning_rate": 9.999999709052384e-06,
      "loss": 0.5307,
      "step": 2050
    },
    {
      "epoch": 0.10066212221163479,
      "grad_norm": 8.008437156677246,
      "learning_rate": 9.999989525889357e-06,
      "loss": 0.5135,
      "step": 2060
    },
    {
      "epoch": 0.10115077329033204,
      "grad_norm": 3.9152987003326416,
      "learning_rate": 9.99996479537936e-06,
      "loss": 0.5098,
      "step": 2070
    },
    {
      "epoch": 0.1016394243690293,
      "grad_norm": 4.81342887878418,
      "learning_rate": 9.999925517594343e-06,
      "loss": 0.5229,
      "step": 2080
    },
    {
      "epoch": 0.10212807544772655,
      "grad_norm": 4.663543224334717,
      "learning_rate": 9.999871692648587e-06,
      "loss": 0.5198,
      "step": 2090
    },
    {
      "epoch": 0.10261672652642381,
      "grad_norm": 3.905458927154541,
      "learning_rate": 9.999803320698692e-06,
      "loss": 0.5074,
      "step": 2100
    },
    {
      "epoch": 0.10310537760512106,
      "grad_norm": 7.694464206695557,
      "learning_rate": 9.999720401943584e-06,
      "loss": 0.503,
      "step": 2110
    },
    {
      "epoch": 0.10359402868381833,
      "grad_norm": 4.2866668701171875,
      "learning_rate": 9.999622936624515e-06,
      "loss": 0.5052,
      "step": 2120
    },
    {
      "epoch": 0.10408267976251558,
      "grad_norm": 7.022489070892334,
      "learning_rate": 9.999510925025058e-06,
      "loss": 0.5087,
      "step": 2130
    },
    {
      "epoch": 0.10457133084121283,
      "grad_norm": 2.201606273651123,
      "learning_rate": 9.999384367471108e-06,
      "loss": 0.5051,
      "step": 2140
    },
    {
      "epoch": 0.10505998191991009,
      "grad_norm": 4.468674659729004,
      "learning_rate": 9.99924326433088e-06,
      "loss": 0.5123,
      "step": 2150
    },
    {
      "epoch": 0.10554863299860734,
      "grad_norm": 3.214961528778076,
      "learning_rate": 9.999087616014909e-06,
      "loss": 0.5045,
      "step": 2160
    },
    {
      "epoch": 0.1060372840773046,
      "grad_norm": 8.839011192321777,
      "learning_rate": 9.998917422976053e-06,
      "loss": 0.5057,
      "step": 2170
    },
    {
      "epoch": 0.10652593515600185,
      "grad_norm": 3.3649775981903076,
      "learning_rate": 9.998732685709482e-06,
      "loss": 0.5026,
      "step": 2180
    },
    {
      "epoch": 0.10701458623469912,
      "grad_norm": 5.231264591217041,
      "learning_rate": 9.998533404752686e-06,
      "loss": 0.4967,
      "step": 2190
    },
    {
      "epoch": 0.10750323731339637,
      "grad_norm": 10.444920539855957,
      "learning_rate": 9.998319580685467e-06,
      "loss": 0.4978,
      "step": 2200
    },
    {
      "epoch": 0.10799188839209363,
      "grad_norm": 3.976793050765991,
      "learning_rate": 9.998091214129943e-06,
      "loss": 0.5012,
      "step": 2210
    },
    {
      "epoch": 0.10848053947079088,
      "grad_norm": 4.761758327484131,
      "learning_rate": 9.997848305750538e-06,
      "loss": 0.4948,
      "step": 2220
    },
    {
      "epoch": 0.10896919054948814,
      "grad_norm": 4.317152976989746,
      "learning_rate": 9.997590856253988e-06,
      "loss": 0.4991,
      "step": 2230
    },
    {
      "epoch": 0.10945784162818539,
      "grad_norm": 3.9865562915802,
      "learning_rate": 9.99731886638934e-06,
      "loss": 0.4973,
      "step": 2240
    },
    {
      "epoch": 0.10994649270688266,
      "grad_norm": 3.0519254207611084,
      "learning_rate": 9.997032336947938e-06,
      "loss": 0.4968,
      "step": 2250
    },
    {
      "epoch": 0.1104351437855799,
      "grad_norm": 3.462034225463867,
      "learning_rate": 9.996731268763434e-06,
      "loss": 0.4908,
      "step": 2260
    },
    {
      "epoch": 0.11092379486427716,
      "grad_norm": 4.285225868225098,
      "learning_rate": 9.996415662711779e-06,
      "loss": 0.4906,
      "step": 2270
    },
    {
      "epoch": 0.11141244594297442,
      "grad_norm": 2.549806833267212,
      "learning_rate": 9.996085519711218e-06,
      "loss": 0.4934,
      "step": 2280
    },
    {
      "epoch": 0.11190109702167167,
      "grad_norm": 6.287642478942871,
      "learning_rate": 9.995740840722297e-06,
      "loss": 0.4969,
      "step": 2290
    },
    {
      "epoch": 0.11238974810036893,
      "grad_norm": 6.043119430541992,
      "learning_rate": 9.99538162674785e-06,
      "loss": 0.4959,
      "step": 2300
    },
    {
      "epoch": 0.11287839917906618,
      "grad_norm": 3.221782922744751,
      "learning_rate": 9.995007878833001e-06,
      "loss": 0.4895,
      "step": 2310
    },
    {
      "epoch": 0.11336705025776345,
      "grad_norm": 7.820531368255615,
      "learning_rate": 9.994619598065162e-06,
      "loss": 0.4921,
      "step": 2320
    },
    {
      "epoch": 0.1138557013364607,
      "grad_norm": 1.8136892318725586,
      "learning_rate": 9.994216785574024e-06,
      "loss": 0.4893,
      "step": 2330
    },
    {
      "epoch": 0.11434435241515796,
      "grad_norm": 2.453530788421631,
      "learning_rate": 9.993799442531562e-06,
      "loss": 0.4874,
      "step": 2340
    },
    {
      "epoch": 0.11483300349385521,
      "grad_norm": 2.470960855484009,
      "learning_rate": 9.993367570152024e-06,
      "loss": 0.4876,
      "step": 2350
    },
    {
      "epoch": 0.11532165457255247,
      "grad_norm": 5.889760971069336,
      "learning_rate": 9.992921169691934e-06,
      "loss": 0.485,
      "step": 2360
    },
    {
      "epoch": 0.11581030565124972,
      "grad_norm": 1.6044597625732422,
      "learning_rate": 9.992460242450081e-06,
      "loss": 0.4857,
      "step": 2370
    },
    {
      "epoch": 0.11629895672994699,
      "grad_norm": 3.4553425312042236,
      "learning_rate": 9.991984789767521e-06,
      "loss": 0.4894,
      "step": 2380
    },
    {
      "epoch": 0.11678760780864424,
      "grad_norm": 4.581337928771973,
      "learning_rate": 9.991494813027576e-06,
      "loss": 0.4915,
      "step": 2390
    },
    {
      "epoch": 0.1172762588873415,
      "grad_norm": 3.9853124618530273,
      "learning_rate": 9.990990313655817e-06,
      "loss": 0.4885,
      "step": 2400
    },
    {
      "epoch": 0.11776490996603875,
      "grad_norm": 2.2269527912139893,
      "learning_rate": 9.990471293120074e-06,
      "loss": 0.4868,
      "step": 2410
    },
    {
      "epoch": 0.118253561044736,
      "grad_norm": 5.388997554779053,
      "learning_rate": 9.989937752930426e-06,
      "loss": 0.4958,
      "step": 2420
    },
    {
      "epoch": 0.11874221212343326,
      "grad_norm": 4.705722332000732,
      "learning_rate": 9.989389694639194e-06,
      "loss": 0.4916,
      "step": 2430
    },
    {
      "epoch": 0.11923086320213051,
      "grad_norm": 3.4011592864990234,
      "learning_rate": 9.988827119840937e-06,
      "loss": 0.4879,
      "step": 2440
    },
    {
      "epoch": 0.11971951428082778,
      "grad_norm": 4.242159366607666,
      "learning_rate": 9.98825003017246e-06,
      "loss": 0.4856,
      "step": 2450
    },
    {
      "epoch": 0.12020816535952503,
      "grad_norm": 3.563094139099121,
      "learning_rate": 9.987658427312785e-06,
      "loss": 0.4838,
      "step": 2460
    },
    {
      "epoch": 0.12069681643822229,
      "grad_norm": 3.6437556743621826,
      "learning_rate": 9.987052312983168e-06,
      "loss": 0.4803,
      "step": 2470
    },
    {
      "epoch": 0.12118546751691954,
      "grad_norm": 7.271683216094971,
      "learning_rate": 9.986431688947083e-06,
      "loss": 0.4855,
      "step": 2480
    },
    {
      "epoch": 0.1216741185956168,
      "grad_norm": 4.0858941078186035,
      "learning_rate": 9.98579655701022e-06,
      "loss": 0.4878,
      "step": 2490
    },
    {
      "epoch": 0.12216276967431405,
      "grad_norm": 4.186237335205078,
      "learning_rate": 9.985146919020483e-06,
      "loss": 0.4849,
      "step": 2500
    },
    {
      "epoch": 0.12216276967431405,
      "eval_loss": 0.46980607509613037,
      "eval_runtime": 728.0838,
      "eval_samples_per_second": 242.978,
      "eval_steps_per_second": 0.475,
      "step": 2500
    },
    {
      "epoch": 0.12265142075301132,
      "grad_norm": 4.360340595245361,
      "learning_rate": 9.984482776867975e-06,
      "loss": 0.4824,
      "step": 2510
    },
    {
      "epoch": 0.12314007183170857,
      "grad_norm": 2.920182228088379,
      "learning_rate": 9.983804132485003e-06,
      "loss": 0.4813,
      "step": 2520
    },
    {
      "epoch": 0.12362872291040583,
      "grad_norm": 2.6488723754882812,
      "learning_rate": 9.983110987846063e-06,
      "loss": 0.4811,
      "step": 2530
    },
    {
      "epoch": 0.12411737398910308,
      "grad_norm": 2.2960548400878906,
      "learning_rate": 9.982403344967847e-06,
      "loss": 0.4755,
      "step": 2540
    },
    {
      "epoch": 0.12460602506780034,
      "grad_norm": 2.8793044090270996,
      "learning_rate": 9.98168120590922e-06,
      "loss": 0.4792,
      "step": 2550
    },
    {
      "epoch": 0.12509467614649758,
      "grad_norm": 2.4910120964050293,
      "learning_rate": 9.980944572771231e-06,
      "loss": 0.4839,
      "step": 2560
    },
    {
      "epoch": 0.12558332722519486,
      "grad_norm": 6.9705891609191895,
      "learning_rate": 9.980193447697095e-06,
      "loss": 0.4792,
      "step": 2570
    },
    {
      "epoch": 0.1260719783038921,
      "grad_norm": 2.401073694229126,
      "learning_rate": 9.979427832872191e-06,
      "loss": 0.4788,
      "step": 2580
    },
    {
      "epoch": 0.12656062938258936,
      "grad_norm": 2.653182029724121,
      "learning_rate": 9.97864773052406e-06,
      "loss": 0.4804,
      "step": 2590
    },
    {
      "epoch": 0.1270492804612866,
      "grad_norm": 2.8506484031677246,
      "learning_rate": 9.977853142922386e-06,
      "loss": 0.4769,
      "step": 2600
    },
    {
      "epoch": 0.12753793153998388,
      "grad_norm": 3.2540268898010254,
      "learning_rate": 9.977044072379006e-06,
      "loss": 0.4797,
      "step": 2610
    },
    {
      "epoch": 0.12802658261868113,
      "grad_norm": 6.425643444061279,
      "learning_rate": 9.976220521247888e-06,
      "loss": 0.4872,
      "step": 2620
    },
    {
      "epoch": 0.12851523369737838,
      "grad_norm": 3.4844772815704346,
      "learning_rate": 9.975382491925137e-06,
      "loss": 0.4775,
      "step": 2630
    },
    {
      "epoch": 0.12900388477607563,
      "grad_norm": 2.7126948833465576,
      "learning_rate": 9.974529986848976e-06,
      "loss": 0.4795,
      "step": 2640
    },
    {
      "epoch": 0.1294925358547729,
      "grad_norm": 3.378321409225464,
      "learning_rate": 9.973663008499748e-06,
      "loss": 0.4851,
      "step": 2650
    },
    {
      "epoch": 0.12998118693347016,
      "grad_norm": 2.3212387561798096,
      "learning_rate": 9.972781559399906e-06,
      "loss": 0.4765,
      "step": 2660
    },
    {
      "epoch": 0.1304698380121674,
      "grad_norm": 3.0284295082092285,
      "learning_rate": 9.971885642114006e-06,
      "loss": 0.4779,
      "step": 2670
    },
    {
      "epoch": 0.13095848909086466,
      "grad_norm": 2.1346194744110107,
      "learning_rate": 9.970975259248696e-06,
      "loss": 0.4765,
      "step": 2680
    },
    {
      "epoch": 0.13144714016956194,
      "grad_norm": 2.0011963844299316,
      "learning_rate": 9.97005041345271e-06,
      "loss": 0.4813,
      "step": 2690
    },
    {
      "epoch": 0.1319357912482592,
      "grad_norm": 3.866771936416626,
      "learning_rate": 9.969111107416867e-06,
      "loss": 0.4766,
      "step": 2700
    },
    {
      "epoch": 0.13242444232695644,
      "grad_norm": 6.982947826385498,
      "learning_rate": 9.968157343874056e-06,
      "loss": 0.4773,
      "step": 2710
    },
    {
      "epoch": 0.1329130934056537,
      "grad_norm": 4.293519973754883,
      "learning_rate": 9.967189125599228e-06,
      "loss": 0.4818,
      "step": 2720
    },
    {
      "epoch": 0.13340174448435094,
      "grad_norm": 3.3985178470611572,
      "learning_rate": 9.966206455409386e-06,
      "loss": 0.4778,
      "step": 2730
    },
    {
      "epoch": 0.13389039556304821,
      "grad_norm": 1.5569087266921997,
      "learning_rate": 9.96520933616359e-06,
      "loss": 0.4737,
      "step": 2740
    },
    {
      "epoch": 0.13437904664174546,
      "grad_norm": 4.966946125030518,
      "learning_rate": 9.964197770762933e-06,
      "loss": 0.4762,
      "step": 2750
    },
    {
      "epoch": 0.13486769772044271,
      "grad_norm": 2.4373340606689453,
      "learning_rate": 9.96317176215054e-06,
      "loss": 0.4764,
      "step": 2760
    },
    {
      "epoch": 0.13535634879913996,
      "grad_norm": 4.127823352813721,
      "learning_rate": 9.962131313311555e-06,
      "loss": 0.4753,
      "step": 2770
    },
    {
      "epoch": 0.13584499987783724,
      "grad_norm": 2.2819466590881348,
      "learning_rate": 9.96107642727314e-06,
      "loss": 0.475,
      "step": 2780
    },
    {
      "epoch": 0.1363336509565345,
      "grad_norm": 5.689523696899414,
      "learning_rate": 9.960007107104462e-06,
      "loss": 0.4748,
      "step": 2790
    },
    {
      "epoch": 0.13682230203523174,
      "grad_norm": 2.3338582515716553,
      "learning_rate": 9.958923355916682e-06,
      "loss": 0.4774,
      "step": 2800
    },
    {
      "epoch": 0.137310953113929,
      "grad_norm": 5.458847522735596,
      "learning_rate": 9.95782517686294e-06,
      "loss": 0.474,
      "step": 2810
    },
    {
      "epoch": 0.13779960419262627,
      "grad_norm": 1.634664535522461,
      "learning_rate": 9.956712573138371e-06,
      "loss": 0.4737,
      "step": 2820
    },
    {
      "epoch": 0.13828825527132352,
      "grad_norm": 1.757805585861206,
      "learning_rate": 9.955585547980065e-06,
      "loss": 0.4713,
      "step": 2830
    },
    {
      "epoch": 0.13877690635002077,
      "grad_norm": 1.5585452318191528,
      "learning_rate": 9.954444104667071e-06,
      "loss": 0.4734,
      "step": 2840
    },
    {
      "epoch": 0.13926555742871802,
      "grad_norm": 8.348752975463867,
      "learning_rate": 9.953288246520393e-06,
      "loss": 0.4754,
      "step": 2850
    },
    {
      "epoch": 0.13975420850741527,
      "grad_norm": 2.4966542720794678,
      "learning_rate": 9.95211797690297e-06,
      "loss": 0.4719,
      "step": 2860
    },
    {
      "epoch": 0.14024285958611254,
      "grad_norm": 2.205169439315796,
      "learning_rate": 9.950933299219676e-06,
      "loss": 0.4705,
      "step": 2870
    },
    {
      "epoch": 0.1407315106648098,
      "grad_norm": 2.2777152061462402,
      "learning_rate": 9.949734216917301e-06,
      "loss": 0.4687,
      "step": 2880
    },
    {
      "epoch": 0.14122016174350704,
      "grad_norm": 1.1067817211151123,
      "learning_rate": 9.948520733484543e-06,
      "loss": 0.4673,
      "step": 2890
    },
    {
      "epoch": 0.1417088128222043,
      "grad_norm": 3.3773841857910156,
      "learning_rate": 9.947292852452003e-06,
      "loss": 0.4707,
      "step": 2900
    },
    {
      "epoch": 0.14219746390090157,
      "grad_norm": 1.1769728660583496,
      "learning_rate": 9.946050577392173e-06,
      "loss": 0.4703,
      "step": 2910
    },
    {
      "epoch": 0.14268611497959882,
      "grad_norm": 7.464486122131348,
      "learning_rate": 9.94479391191942e-06,
      "loss": 0.4723,
      "step": 2920
    },
    {
      "epoch": 0.14317476605829607,
      "grad_norm": 2.232747793197632,
      "learning_rate": 9.94352285968998e-06,
      "loss": 0.4735,
      "step": 2930
    },
    {
      "epoch": 0.14366341713699332,
      "grad_norm": 3.54618239402771,
      "learning_rate": 9.942237424401952e-06,
      "loss": 0.4695,
      "step": 2940
    },
    {
      "epoch": 0.1441520682156906,
      "grad_norm": 1.840293526649475,
      "learning_rate": 9.940937609795276e-06,
      "loss": 0.471,
      "step": 2950
    },
    {
      "epoch": 0.14464071929438785,
      "grad_norm": 3.1132638454437256,
      "learning_rate": 9.939623419651732e-06,
      "loss": 0.47,
      "step": 2960
    },
    {
      "epoch": 0.1451293703730851,
      "grad_norm": 1.120263695716858,
      "learning_rate": 9.93829485779492e-06,
      "loss": 0.47,
      "step": 2970
    },
    {
      "epoch": 0.14561802145178235,
      "grad_norm": 4.9828057289123535,
      "learning_rate": 9.936951928090266e-06,
      "loss": 0.4731,
      "step": 2980
    },
    {
      "epoch": 0.1461066725304796,
      "grad_norm": 2.0490236282348633,
      "learning_rate": 9.935594634444985e-06,
      "loss": 0.4707,
      "step": 2990
    },
    {
      "epoch": 0.14659532360917688,
      "grad_norm": 3.313997268676758,
      "learning_rate": 9.93422298080809e-06,
      "loss": 0.4675,
      "step": 3000
    },
    {
      "epoch": 0.14659532360917688,
      "eval_loss": 0.45171666145324707,
      "eval_runtime": 727.5975,
      "eval_samples_per_second": 243.14,
      "eval_steps_per_second": 0.476,
      "step": 3000
    },
    {
      "epoch": 0.14708397468787412,
      "grad_norm": 3.7955098152160645,
      "learning_rate": 9.932836971170375e-06,
      "loss": 0.4759,
      "step": 3010
    },
    {
      "epoch": 0.14757262576657137,
      "grad_norm": 0.955259382724762,
      "learning_rate": 9.931436609564402e-06,
      "loss": 0.4676,
      "step": 3020
    },
    {
      "epoch": 0.14806127684526862,
      "grad_norm": 1.6290405988693237,
      "learning_rate": 9.930021900064486e-06,
      "loss": 0.47,
      "step": 3030
    },
    {
      "epoch": 0.1485499279239659,
      "grad_norm": 4.106773376464844,
      "learning_rate": 9.928592846786693e-06,
      "loss": 0.4693,
      "step": 3040
    },
    {
      "epoch": 0.14903857900266315,
      "grad_norm": 1.7998560667037964,
      "learning_rate": 9.927149453888814e-06,
      "loss": 0.4679,
      "step": 3050
    },
    {
      "epoch": 0.1495272300813604,
      "grad_norm": 3.9935462474823,
      "learning_rate": 9.92569172557037e-06,
      "loss": 0.4675,
      "step": 3060
    },
    {
      "epoch": 0.15001588116005765,
      "grad_norm": 1.6421153545379639,
      "learning_rate": 9.924219666072584e-06,
      "loss": 0.469,
      "step": 3070
    },
    {
      "epoch": 0.15050453223875493,
      "grad_norm": 6.065347671508789,
      "learning_rate": 9.922733279678376e-06,
      "loss": 0.478,
      "step": 3080
    },
    {
      "epoch": 0.15099318331745218,
      "grad_norm": 4.049252986907959,
      "learning_rate": 9.921232570712351e-06,
      "loss": 0.4734,
      "step": 3090
    },
    {
      "epoch": 0.15148183439614943,
      "grad_norm": 3.9484283924102783,
      "learning_rate": 9.919717543540786e-06,
      "loss": 0.4702,
      "step": 3100
    },
    {
      "epoch": 0.15197048547484668,
      "grad_norm": 3.8022537231445312,
      "learning_rate": 9.918188202571615e-06,
      "loss": 0.4674,
      "step": 3110
    },
    {
      "epoch": 0.15245913655354396,
      "grad_norm": 3.4525346755981445,
      "learning_rate": 9.916644552254417e-06,
      "loss": 0.4724,
      "step": 3120
    },
    {
      "epoch": 0.1529477876322412,
      "grad_norm": 1.305325984954834,
      "learning_rate": 9.915086597080407e-06,
      "loss": 0.468,
      "step": 3130
    },
    {
      "epoch": 0.15343643871093846,
      "grad_norm": 2.5436055660247803,
      "learning_rate": 9.913514341582415e-06,
      "loss": 0.4706,
      "step": 3140
    },
    {
      "epoch": 0.1539250897896357,
      "grad_norm": 2.798241376876831,
      "learning_rate": 9.911927790334882e-06,
      "loss": 0.4695,
      "step": 3150
    },
    {
      "epoch": 0.15441374086833295,
      "grad_norm": 2.0094220638275146,
      "learning_rate": 9.910326947953838e-06,
      "loss": 0.4694,
      "step": 3160
    },
    {
      "epoch": 0.15490239194703023,
      "grad_norm": 2.610715389251709,
      "learning_rate": 9.908711819096897e-06,
      "loss": 0.4668,
      "step": 3170
    },
    {
      "epoch": 0.15539104302572748,
      "grad_norm": 4.6221232414245605,
      "learning_rate": 9.907082408463234e-06,
      "loss": 0.4679,
      "step": 3180
    },
    {
      "epoch": 0.15587969410442473,
      "grad_norm": 5.538655757904053,
      "learning_rate": 9.905438720793582e-06,
      "loss": 0.474,
      "step": 3190
    },
    {
      "epoch": 0.15636834518312198,
      "grad_norm": 2.6583926677703857,
      "learning_rate": 9.903780760870208e-06,
      "loss": 0.475,
      "step": 3200
    },
    {
      "epoch": 0.15685699626181926,
      "grad_norm": 4.67283821105957,
      "learning_rate": 9.902108533516907e-06,
      "loss": 0.4693,
      "step": 3210
    },
    {
      "epoch": 0.1573456473405165,
      "grad_norm": 2.7513134479522705,
      "learning_rate": 9.900422043598982e-06,
      "loss": 0.4675,
      "step": 3220
    },
    {
      "epoch": 0.15783429841921376,
      "grad_norm": 1.7802903652191162,
      "learning_rate": 9.898721296023234e-06,
      "loss": 0.466,
      "step": 3230
    },
    {
      "epoch": 0.158322949497911,
      "grad_norm": 2.868180751800537,
      "learning_rate": 9.89700629573795e-06,
      "loss": 0.4652,
      "step": 3240
    },
    {
      "epoch": 0.15881160057660829,
      "grad_norm": 2.2115590572357178,
      "learning_rate": 9.895277047732879e-06,
      "loss": 0.4649,
      "step": 3250
    },
    {
      "epoch": 0.15930025165530554,
      "grad_norm": 2.7699434757232666,
      "learning_rate": 9.893533557039223e-06,
      "loss": 0.466,
      "step": 3260
    },
    {
      "epoch": 0.15978890273400279,
      "grad_norm": 2.4520747661590576,
      "learning_rate": 9.891775828729628e-06,
      "loss": 0.4639,
      "step": 3270
    },
    {
      "epoch": 0.16027755381270004,
      "grad_norm": 2.2992360591888428,
      "learning_rate": 9.890003867918162e-06,
      "loss": 0.4643,
      "step": 3280
    },
    {
      "epoch": 0.16076620489139729,
      "grad_norm": 2.04976224899292,
      "learning_rate": 9.888217679760303e-06,
      "loss": 0.4649,
      "step": 3290
    },
    {
      "epoch": 0.16125485597009456,
      "grad_norm": 1.9434853792190552,
      "learning_rate": 9.886417269452918e-06,
      "loss": 0.4665,
      "step": 3300
    },
    {
      "epoch": 0.1617435070487918,
      "grad_norm": 2.6264779567718506,
      "learning_rate": 9.884602642234257e-06,
      "loss": 0.4647,
      "step": 3310
    },
    {
      "epoch": 0.16223215812748906,
      "grad_norm": 3.206934690475464,
      "learning_rate": 9.882773803383934e-06,
      "loss": 0.4675,
      "step": 3320
    },
    {
      "epoch": 0.1627208092061863,
      "grad_norm": 7.612506866455078,
      "learning_rate": 9.880930758222912e-06,
      "loss": 0.4728,
      "step": 3330
    },
    {
      "epoch": 0.1632094602848836,
      "grad_norm": 1.3091853857040405,
      "learning_rate": 9.879073512113487e-06,
      "loss": 0.4691,
      "step": 3340
    },
    {
      "epoch": 0.16369811136358084,
      "grad_norm": 3.0943753719329834,
      "learning_rate": 9.877202070459268e-06,
      "loss": 0.4657,
      "step": 3350
    },
    {
      "epoch": 0.1641867624422781,
      "grad_norm": 1.4435592889785767,
      "learning_rate": 9.87531643870517e-06,
      "loss": 0.465,
      "step": 3360
    },
    {
      "epoch": 0.16467541352097534,
      "grad_norm": 1.4803426265716553,
      "learning_rate": 9.87341662233739e-06,
      "loss": 0.4637,
      "step": 3370
    },
    {
      "epoch": 0.16516406459967262,
      "grad_norm": 0.7361840605735779,
      "learning_rate": 9.871502626883403e-06,
      "loss": 0.463,
      "step": 3380
    },
    {
      "epoch": 0.16565271567836987,
      "grad_norm": 36.265968322753906,
      "learning_rate": 9.869574457911925e-06,
      "loss": 0.4701,
      "step": 3390
    },
    {
      "epoch": 0.16614136675706712,
      "grad_norm": 1.7011586427688599,
      "learning_rate": 9.86763212103292e-06,
      "loss": 0.4701,
      "step": 3400
    },
    {
      "epoch": 0.16663001783576437,
      "grad_norm": 2.6717042922973633,
      "learning_rate": 9.865675621897571e-06,
      "loss": 0.4644,
      "step": 3410
    },
    {
      "epoch": 0.16711866891446162,
      "grad_norm": 2.1945064067840576,
      "learning_rate": 9.86370496619826e-06,
      "loss": 0.4641,
      "step": 3420
    },
    {
      "epoch": 0.1676073199931589,
      "grad_norm": 1.7178106307983398,
      "learning_rate": 9.861720159668566e-06,
      "loss": 0.4628,
      "step": 3430
    },
    {
      "epoch": 0.16809597107185614,
      "grad_norm": 1.945646047592163,
      "learning_rate": 9.85972120808323e-06,
      "loss": 0.4623,
      "step": 3440
    },
    {
      "epoch": 0.1685846221505534,
      "grad_norm": 1.8980379104614258,
      "learning_rate": 9.857708117258158e-06,
      "loss": 0.4621,
      "step": 3450
    },
    {
      "epoch": 0.16907327322925064,
      "grad_norm": 1.8674242496490479,
      "learning_rate": 9.855680893050384e-06,
      "loss": 0.4621,
      "step": 3460
    },
    {
      "epoch": 0.16956192430794792,
      "grad_norm": 1.6167813539505005,
      "learning_rate": 9.853639541358069e-06,
      "loss": 0.4629,
      "step": 3470
    },
    {
      "epoch": 0.17005057538664517,
      "grad_norm": 2.2286250591278076,
      "learning_rate": 9.851584068120477e-06,
      "loss": 0.4634,
      "step": 3480
    },
    {
      "epoch": 0.17053922646534242,
      "grad_norm": 1.4811843633651733,
      "learning_rate": 9.849514479317955e-06,
      "loss": 0.4614,
      "step": 3490
    },
    {
      "epoch": 0.17102787754403967,
      "grad_norm": 3.2708358764648438,
      "learning_rate": 9.84743078097192e-06,
      "loss": 0.4616,
      "step": 3500
    },
    {
      "epoch": 0.17102787754403967,
      "eval_loss": 0.4388451874256134,
      "eval_runtime": 729.7082,
      "eval_samples_per_second": 242.437,
      "eval_steps_per_second": 0.474,
      "step": 3500
    },
    {
      "epoch": 0.17151652862273695,
      "grad_norm": 2.3546407222747803,
      "learning_rate": 9.845332979144845e-06,
      "loss": 0.4629,
      "step": 3510
    },
    {
      "epoch": 0.1720051797014342,
      "grad_norm": 2.703920841217041,
      "learning_rate": 9.84322107994023e-06,
      "loss": 0.4624,
      "step": 3520
    },
    {
      "epoch": 0.17249383078013145,
      "grad_norm": 2.2422356605529785,
      "learning_rate": 9.841095089502595e-06,
      "loss": 0.4625,
      "step": 3530
    },
    {
      "epoch": 0.1729824818588287,
      "grad_norm": 1.0636337995529175,
      "learning_rate": 9.838955014017455e-06,
      "loss": 0.46,
      "step": 3540
    },
    {
      "epoch": 0.17347113293752595,
      "grad_norm": 3.9872353076934814,
      "learning_rate": 9.836800859711311e-06,
      "loss": 0.4601,
      "step": 3550
    },
    {
      "epoch": 0.17395978401622322,
      "grad_norm": 1.2745929956436157,
      "learning_rate": 9.83463263285162e-06,
      "loss": 0.4628,
      "step": 3560
    },
    {
      "epoch": 0.17444843509492047,
      "grad_norm": 2.2762491703033447,
      "learning_rate": 9.832450339746785e-06,
      "loss": 0.4622,
      "step": 3570
    },
    {
      "epoch": 0.17493708617361772,
      "grad_norm": 1.6486016511917114,
      "learning_rate": 9.830253986746134e-06,
      "loss": 0.4699,
| "step": 3580 |
| }, |
| { |
| "epoch": 0.17542573725231497, |
| "grad_norm": 1.5666919946670532, |
| "learning_rate": 9.8280435802399e-06, |
| "loss": 0.4646, |
| "step": 3590 |
| }, |
| { |
| "epoch": 0.17591438833101225, |
| "grad_norm": 1.8892680406570435, |
| "learning_rate": 9.825819126659214e-06, |
| "loss": 0.4646, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.1764030394097095, |
| "grad_norm": 2.9722862243652344, |
| "learning_rate": 9.823580632476062e-06, |
| "loss": 0.4598, |
| "step": 3610 |
| }, |
| { |
| "epoch": 0.17689169048840675, |
| "grad_norm": 2.4820001125335693, |
| "learning_rate": 9.82132810420329e-06, |
| "loss": 0.4629, |
| "step": 3620 |
| }, |
| { |
| "epoch": 0.177380341567104, |
| "grad_norm": 2.4330859184265137, |
| "learning_rate": 9.819061548394574e-06, |
| "loss": 0.4611, |
| "step": 3630 |
| }, |
| { |
| "epoch": 0.17786899264580128, |
| "grad_norm": 3.4170515537261963, |
| "learning_rate": 9.816780971644403e-06, |
| "loss": 0.4647, |
| "step": 3640 |
| }, |
| { |
| "epoch": 0.17835764372449853, |
| "grad_norm": 1.6359800100326538, |
| "learning_rate": 9.814486380588058e-06, |
| "loss": 0.4629, |
| "step": 3650 |
| }, |
| { |
| "epoch": 0.17884629480319578, |
| "grad_norm": 1.80881929397583, |
| "learning_rate": 9.812177781901597e-06, |
| "loss": 0.4607, |
| "step": 3660 |
| }, |
| { |
| "epoch": 0.17933494588189303, |
| "grad_norm": 1.8495383262634277, |
| "learning_rate": 9.80985518230183e-06, |
| "loss": 0.4598, |
| "step": 3670 |
| }, |
| { |
| "epoch": 0.1798235969605903, |
| "grad_norm": 2.4074761867523193, |
| "learning_rate": 9.807518588546305e-06, |
| "loss": 0.4609, |
| "step": 3680 |
| }, |
| { |
| "epoch": 0.18031224803928755, |
| "grad_norm": 3.074289321899414, |
| "learning_rate": 9.805168007433283e-06, |
| "loss": 0.4599, |
| "step": 3690 |
| }, |
| { |
| "epoch": 0.1808008991179848, |
| "grad_norm": 3.443209648132324, |
| "learning_rate": 9.802803445801723e-06, |
| "loss": 0.4589, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.18128955019668205, |
| "grad_norm": 1.7589000463485718, |
| "learning_rate": 9.800424910531256e-06, |
| "loss": 0.4608, |
| "step": 3710 |
| }, |
| { |
| "epoch": 0.1817782012753793, |
| "grad_norm": 1.6772186756134033, |
| "learning_rate": 9.798032408542177e-06, |
| "loss": 0.4614, |
| "step": 3720 |
| }, |
| { |
| "epoch": 0.18226685235407658, |
| "grad_norm": 2.250244617462158, |
| "learning_rate": 9.79562594679541e-06, |
| "loss": 0.4601, |
| "step": 3730 |
| }, |
| { |
| "epoch": 0.18275550343277383, |
| "grad_norm": 1.436660647392273, |
| "learning_rate": 9.793205532292496e-06, |
| "loss": 0.459, |
| "step": 3740 |
| }, |
| { |
| "epoch": 0.18324415451147108, |
| "grad_norm": 2.040019989013672, |
| "learning_rate": 9.79077117207557e-06, |
| "loss": 0.4691, |
| "step": 3750 |
| }, |
| { |
| "epoch": 0.18373280559016833, |
| "grad_norm": 2.091820240020752, |
| "learning_rate": 9.788322873227347e-06, |
| "loss": 0.4624, |
| "step": 3760 |
| }, |
| { |
| "epoch": 0.1842214566688656, |
| "grad_norm": 2.1219372749328613, |
| "learning_rate": 9.78586064287109e-06, |
| "loss": 0.4614, |
| "step": 3770 |
| }, |
| { |
| "epoch": 0.18471010774756286, |
| "grad_norm": 1.5753206014633179, |
| "learning_rate": 9.783384488170598e-06, |
| "loss": 0.4635, |
| "step": 3780 |
| }, |
| { |
| "epoch": 0.1851987588262601, |
| "grad_norm": 2.6877732276916504, |
| "learning_rate": 9.780894416330182e-06, |
| "loss": 0.4626, |
| "step": 3790 |
| }, |
| { |
| "epoch": 0.18568740990495736, |
| "grad_norm": 1.7835508584976196, |
| "learning_rate": 9.778390434594647e-06, |
| "loss": 0.461, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.18617606098365463, |
| "grad_norm": 2.014145851135254, |
| "learning_rate": 9.775872550249266e-06, |
| "loss": 0.4595, |
| "step": 3810 |
| }, |
| { |
| "epoch": 0.18666471206235188, |
| "grad_norm": 1.9438420534133911, |
| "learning_rate": 9.77334077061976e-06, |
| "loss": 0.459, |
| "step": 3820 |
| }, |
| { |
| "epoch": 0.18715336314104913, |
| "grad_norm": 1.6419105529785156, |
| "learning_rate": 9.770795103072281e-06, |
| "loss": 0.4572, |
| "step": 3830 |
| }, |
| { |
| "epoch": 0.18764201421974638, |
| "grad_norm": 1.0788559913635254, |
| "learning_rate": 9.768235555013385e-06, |
| "loss": 0.4582, |
| "step": 3840 |
| }, |
| { |
| "epoch": 0.18813066529844363, |
| "grad_norm": 1.149911642074585, |
| "learning_rate": 9.765662133890017e-06, |
| "loss": 0.4573, |
| "step": 3850 |
| }, |
| { |
| "epoch": 0.1886193163771409, |
| "grad_norm": 1.5427783727645874, |
| "learning_rate": 9.763074847189483e-06, |
| "loss": 0.4637, |
| "step": 3860 |
| }, |
| { |
| "epoch": 0.18910796745583816, |
| "grad_norm": 2.3992674350738525, |
| "learning_rate": 9.760473702439426e-06, |
| "loss": 0.4629, |
| "step": 3870 |
| }, |
| { |
| "epoch": 0.1895966185345354, |
| "grad_norm": 2.0136971473693848, |
| "learning_rate": 9.757858707207815e-06, |
| "loss": 0.4584, |
| "step": 3880 |
| }, |
| { |
| "epoch": 0.19008526961323266, |
| "grad_norm": 0.9144098162651062, |
| "learning_rate": 9.755229869102916e-06, |
| "loss": 0.4597, |
| "step": 3890 |
| }, |
| { |
| "epoch": 0.19057392069192994, |
| "grad_norm": 4.437480449676514, |
| "learning_rate": 9.752587195773268e-06, |
| "loss": 0.4584, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.1910625717706272, |
| "grad_norm": 2.2352585792541504, |
| "learning_rate": 9.749930694907666e-06, |
| "loss": 0.4584, |
| "step": 3910 |
| }, |
| { |
| "epoch": 0.19155122284932444, |
| "grad_norm": 1.6085118055343628, |
| "learning_rate": 9.74726037423513e-06, |
| "loss": 0.4598, |
| "step": 3920 |
| }, |
| { |
| "epoch": 0.1920398739280217, |
| "grad_norm": 0.8404253721237183, |
| "learning_rate": 9.744576241524895e-06, |
| "loss": 0.4571, |
| "step": 3930 |
| }, |
| { |
| "epoch": 0.19252852500671896, |
| "grad_norm": 1.5468897819519043, |
| "learning_rate": 9.741878304586379e-06, |
| "loss": 0.4586, |
| "step": 3940 |
| }, |
| { |
| "epoch": 0.19301717608541621, |
| "grad_norm": 3.8875999450683594, |
| "learning_rate": 9.739166571269166e-06, |
| "loss": 0.4601, |
| "step": 3950 |
| }, |
| { |
| "epoch": 0.19350582716411346, |
| "grad_norm": 2.8351383209228516, |
| "learning_rate": 9.736441049462973e-06, |
| "loss": 0.4598, |
| "step": 3960 |
| }, |
| { |
| "epoch": 0.19399447824281071, |
| "grad_norm": 2.2148053646087646, |
| "learning_rate": 9.733701747097641e-06, |
| "loss": 0.4604, |
| "step": 3970 |
| }, |
| { |
| "epoch": 0.19448312932150796, |
| "grad_norm": 2.287990093231201, |
| "learning_rate": 9.730948672143105e-06, |
| "loss": 0.4576, |
| "step": 3980 |
| }, |
| { |
| "epoch": 0.19497178040020524, |
| "grad_norm": 1.138305902481079, |
| "learning_rate": 9.728181832609366e-06, |
| "loss": 0.458, |
| "step": 3990 |
| }, |
| { |
| "epoch": 0.1954604314789025, |
| "grad_norm": 0.8007479906082153, |
| "learning_rate": 9.725401236546476e-06, |
| "loss": 0.4593, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.1954604314789025, |
| "eval_loss": 0.43675804138183594, |
| "eval_runtime": 728.6855, |
| "eval_samples_per_second": 242.777, |
| "eval_steps_per_second": 0.475, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.19594908255759974, |
| "grad_norm": 3.4049859046936035, |
| "learning_rate": 9.722606892044516e-06, |
| "loss": 0.4573, |
| "step": 4010 |
| }, |
| { |
| "epoch": 0.196437733636297, |
| "grad_norm": 1.6767579317092896, |
| "learning_rate": 9.719798807233555e-06, |
| "loss": 0.461, |
| "step": 4020 |
| }, |
| { |
| "epoch": 0.19692638471499427, |
| "grad_norm": 1.4466297626495361, |
| "learning_rate": 9.716976990283654e-06, |
| "loss": 0.4629, |
| "step": 4030 |
| }, |
| { |
| "epoch": 0.19741503579369152, |
| "grad_norm": 8.470952033996582, |
| "learning_rate": 9.714141449404815e-06, |
| "loss": 0.4857, |
| "step": 4040 |
| }, |
| { |
| "epoch": 0.19790368687238877, |
| "grad_norm": 1.1967353820800781, |
| "learning_rate": 9.711292192846979e-06, |
| "loss": 0.4613, |
| "step": 4050 |
| }, |
| { |
| "epoch": 0.19839233795108602, |
| "grad_norm": 1.025749921798706, |
| "learning_rate": 9.708429228899984e-06, |
| "loss": 0.4579, |
| "step": 4060 |
| }, |
| { |
| "epoch": 0.1988809890297833, |
| "grad_norm": 2.6049964427948, |
| "learning_rate": 9.705552565893557e-06, |
| "loss": 0.46, |
| "step": 4070 |
| }, |
| { |
| "epoch": 0.19936964010848054, |
| "grad_norm": 1.4117764234542847, |
| "learning_rate": 9.702662212197277e-06, |
| "loss": 0.4598, |
| "step": 4080 |
| }, |
| { |
| "epoch": 0.1998582911871778, |
| "grad_norm": 1.602464199066162, |
| "learning_rate": 9.699758176220558e-06, |
| "loss": 0.4579, |
| "step": 4090 |
| }, |
| { |
| "epoch": 0.20034694226587504, |
| "grad_norm": 2.6832380294799805, |
| "learning_rate": 9.696840466412619e-06, |
| "loss": 0.4582, |
| "step": 4100 |
| }, |
| { |
| "epoch": 0.20083559334457232, |
| "grad_norm": 1.2473195791244507, |
| "learning_rate": 9.693909091262467e-06, |
| "loss": 0.457, |
| "step": 4110 |
| }, |
| { |
| "epoch": 0.20132424442326957, |
| "grad_norm": 1.5877009630203247, |
| "learning_rate": 9.690964059298866e-06, |
| "loss": 0.4565, |
| "step": 4120 |
| }, |
| { |
| "epoch": 0.20181289550196682, |
| "grad_norm": 2.6137261390686035, |
| "learning_rate": 9.688005379090315e-06, |
| "loss": 0.4566, |
| "step": 4130 |
| }, |
| { |
| "epoch": 0.20230154658066407, |
| "grad_norm": 2.4244110584259033, |
| "learning_rate": 9.68503305924502e-06, |
| "loss": 0.4567, |
| "step": 4140 |
| }, |
| { |
| "epoch": 0.20279019765936132, |
| "grad_norm": 2.0475914478302, |
| "learning_rate": 9.682047108410875e-06, |
| "loss": 0.458, |
| "step": 4150 |
| }, |
| { |
| "epoch": 0.2032788487380586, |
| "grad_norm": 0.8052435517311096, |
| "learning_rate": 9.679047535275427e-06, |
| "loss": 0.4567, |
| "step": 4160 |
| }, |
| { |
| "epoch": 0.20376749981675585, |
| "grad_norm": 3.230631113052368, |
| "learning_rate": 9.676034348565865e-06, |
| "loss": 0.4569, |
| "step": 4170 |
| }, |
| { |
| "epoch": 0.2042561508954531, |
| "grad_norm": 2.166372776031494, |
| "learning_rate": 9.673007557048981e-06, |
| "loss": 0.4564, |
| "step": 4180 |
| }, |
| { |
| "epoch": 0.20474480197415035, |
| "grad_norm": 1.2645494937896729, |
| "learning_rate": 9.669967169531148e-06, |
| "loss": 0.4547, |
| "step": 4190 |
| }, |
| { |
| "epoch": 0.20523345305284763, |
| "grad_norm": 2.206819772720337, |
| "learning_rate": 9.666913194858301e-06, |
| "loss": 0.4563, |
| "step": 4200 |
| }, |
| { |
| "epoch": 0.20572210413154488, |
| "grad_norm": 0.8847692608833313, |
| "learning_rate": 9.663845641915901e-06, |
| "loss": 0.4581, |
| "step": 4210 |
| }, |
| { |
| "epoch": 0.20621075521024212, |
| "grad_norm": 2.756206512451172, |
| "learning_rate": 9.660764519628925e-06, |
| "loss": 0.458, |
| "step": 4220 |
| }, |
| { |
| "epoch": 0.20669940628893937, |
| "grad_norm": 8.863795280456543, |
| "learning_rate": 9.657669836961816e-06, |
| "loss": 0.458, |
| "step": 4230 |
| }, |
| { |
| "epoch": 0.20718805736763665, |
| "grad_norm": 1.3385523557662964, |
| "learning_rate": 9.654561602918481e-06, |
| "loss": 0.4597, |
| "step": 4240 |
| }, |
| { |
| "epoch": 0.2076767084463339, |
| "grad_norm": 0.9093275666236877, |
| "learning_rate": 9.651439826542252e-06, |
| "loss": 0.4561, |
| "step": 4250 |
| }, |
| { |
| "epoch": 0.20816535952503115, |
| "grad_norm": 5.106472492218018, |
| "learning_rate": 9.648304516915856e-06, |
| "loss": 0.457, |
| "step": 4260 |
| }, |
| { |
| "epoch": 0.2086540106037284, |
| "grad_norm": 1.5481184720993042, |
| "learning_rate": 9.645155683161405e-06, |
| "loss": 0.4607, |
| "step": 4270 |
| }, |
| { |
| "epoch": 0.20914266168242565, |
| "grad_norm": 1.1324431896209717, |
| "learning_rate": 9.641993334440349e-06, |
| "loss": 0.4578, |
| "step": 4280 |
| }, |
| { |
| "epoch": 0.20963131276112293, |
| "grad_norm": 2.1077959537506104, |
| "learning_rate": 9.638817479953466e-06, |
| "loss": 0.4551, |
| "step": 4290 |
| }, |
| { |
| "epoch": 0.21011996383982018, |
| "grad_norm": 1.6443114280700684, |
| "learning_rate": 9.635628128940827e-06, |
| "loss": 0.4564, |
| "step": 4300 |
| }, |
| { |
| "epoch": 0.21060861491851743, |
| "grad_norm": 6.366663455963135, |
| "learning_rate": 9.632425290681771e-06, |
| "loss": 0.455, |
| "step": 4310 |
| }, |
| { |
| "epoch": 0.21109726599721468, |
| "grad_norm": 1.1640760898590088, |
| "learning_rate": 9.629208974494876e-06, |
| "loss": 0.4568, |
| "step": 4320 |
| }, |
| { |
| "epoch": 0.21158591707591196, |
| "grad_norm": 1.8168015480041504, |
| "learning_rate": 9.625979189737935e-06, |
| "loss": 0.4551, |
| "step": 4330 |
| }, |
| { |
| "epoch": 0.2120745681546092, |
| "grad_norm": 1.9787592887878418, |
| "learning_rate": 9.62273594580793e-06, |
| "loss": 0.4578, |
| "step": 4340 |
| }, |
| { |
| "epoch": 0.21256321923330646, |
| "grad_norm": 1.0346274375915527, |
| "learning_rate": 9.619479252141e-06, |
| "loss": 0.4559, |
| "step": 4350 |
| }, |
| { |
| "epoch": 0.2130518703120037, |
| "grad_norm": 1.2450488805770874, |
| "learning_rate": 9.61620911821241e-06, |
| "loss": 0.454, |
| "step": 4360 |
| }, |
| { |
| "epoch": 0.21354052139070098, |
| "grad_norm": 6.733485221862793, |
| "learning_rate": 9.61292555353654e-06, |
| "loss": 0.4844, |
| "step": 4370 |
| }, |
| { |
| "epoch": 0.21402917246939823, |
| "grad_norm": 3.55191707611084, |
| "learning_rate": 9.609628567666838e-06, |
| "loss": 0.4783, |
| "step": 4380 |
| }, |
| { |
| "epoch": 0.21451782354809548, |
| "grad_norm": 1.3721413612365723, |
| "learning_rate": 9.606318170195805e-06, |
| "loss": 0.4614, |
| "step": 4390 |
| }, |
| { |
| "epoch": 0.21500647462679273, |
| "grad_norm": 1.7321209907531738, |
| "learning_rate": 9.602994370754962e-06, |
| "loss": 0.457, |
| "step": 4400 |
| }, |
| { |
| "epoch": 0.21549512570548998, |
| "grad_norm": 4.538996696472168, |
| "learning_rate": 9.599657179014821e-06, |
| "loss": 0.4573, |
| "step": 4410 |
| }, |
| { |
| "epoch": 0.21598377678418726, |
| "grad_norm": 1.3797237873077393, |
| "learning_rate": 9.596306604684859e-06, |
| "loss": 0.4569, |
| "step": 4420 |
| }, |
| { |
| "epoch": 0.2164724278628845, |
| "grad_norm": 1.5394078493118286, |
| "learning_rate": 9.59294265751349e-06, |
| "loss": 0.454, |
| "step": 4430 |
| }, |
| { |
| "epoch": 0.21696107894158176, |
| "grad_norm": 5.713876247406006, |
| "learning_rate": 9.589565347288036e-06, |
| "loss": 0.4559, |
| "step": 4440 |
| }, |
| { |
| "epoch": 0.217449730020279, |
| "grad_norm": 1.5455598831176758, |
| "learning_rate": 9.5861746838347e-06, |
| "loss": 0.4556, |
| "step": 4450 |
| }, |
| { |
| "epoch": 0.21793838109897629, |
| "grad_norm": 1.5440407991409302, |
| "learning_rate": 9.58277067701853e-06, |
| "loss": 0.4548, |
| "step": 4460 |
| }, |
| { |
| "epoch": 0.21842703217767354, |
| "grad_norm": 2.495877981185913, |
| "learning_rate": 9.579353336743406e-06, |
| "loss": 0.4551, |
| "step": 4470 |
| }, |
| { |
| "epoch": 0.21891568325637079, |
| "grad_norm": 1.8251252174377441, |
| "learning_rate": 9.575922672951992e-06, |
| "loss": 0.4543, |
| "step": 4480 |
| }, |
| { |
| "epoch": 0.21940433433506804, |
| "grad_norm": 1.808957815170288, |
| "learning_rate": 9.572478695625722e-06, |
| "loss": 0.4533, |
| "step": 4490 |
| }, |
| { |
| "epoch": 0.2198929854137653, |
| "grad_norm": 1.994743824005127, |
| "learning_rate": 9.56902141478476e-06, |
| "loss": 0.4536, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.2198929854137653, |
| "eval_loss": 0.42861247062683105, |
| "eval_runtime": 729.5786, |
| "eval_samples_per_second": 242.48, |
| "eval_steps_per_second": 0.474, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.22038163649246256, |
| "grad_norm": 1.9364984035491943, |
| "learning_rate": 9.565550840487987e-06, |
| "loss": 0.4548, |
| "step": 4510 |
| }, |
| { |
| "epoch": 0.2208702875711598, |
| "grad_norm": 2.0791361331939697, |
| "learning_rate": 9.562066982832945e-06, |
| "loss": 0.4546, |
| "step": 4520 |
| }, |
| { |
| "epoch": 0.22135893864985706, |
| "grad_norm": 1.759068489074707, |
| "learning_rate": 9.55856985195584e-06, |
| "loss": 0.455, |
| "step": 4530 |
| }, |
| { |
| "epoch": 0.2218475897285543, |
| "grad_norm": 1.7903980016708374, |
| "learning_rate": 9.555059458031485e-06, |
| "loss": 0.4536, |
| "step": 4540 |
| }, |
| { |
| "epoch": 0.2223362408072516, |
| "grad_norm": 1.3520255088806152, |
| "learning_rate": 9.551535811273285e-06, |
| "loss": 0.4521, |
| "step": 4550 |
| }, |
| { |
| "epoch": 0.22282489188594884, |
| "grad_norm": 1.4286073446273804, |
| "learning_rate": 9.547998921933203e-06, |
| "loss": 0.4541, |
| "step": 4560 |
| }, |
| { |
| "epoch": 0.2233135429646461, |
| "grad_norm": 1.2026102542877197, |
| "learning_rate": 9.544448800301736e-06, |
| "loss": 0.4531, |
| "step": 4570 |
| }, |
| { |
| "epoch": 0.22380219404334334, |
| "grad_norm": 3.257838010787964, |
| "learning_rate": 9.54088545670787e-06, |
| "loss": 0.4614, |
| "step": 4580 |
| }, |
| { |
| "epoch": 0.22429084512204062, |
| "grad_norm": 1.2527670860290527, |
| "learning_rate": 9.537308901519073e-06, |
| "loss": 0.4606, |
| "step": 4590 |
| }, |
| { |
| "epoch": 0.22477949620073787, |
| "grad_norm": 4.201780319213867, |
| "learning_rate": 9.533719145141239e-06, |
| "loss": 0.4577, |
| "step": 4600 |
| }, |
| { |
| "epoch": 0.22526814727943512, |
| "grad_norm": 1.9157164096832275, |
| "learning_rate": 9.530116198018677e-06, |
| "loss": 0.4566, |
| "step": 4610 |
| }, |
| { |
| "epoch": 0.22575679835813237, |
| "grad_norm": 1.9841718673706055, |
| "learning_rate": 9.526500070634075e-06, |
| "loss": 0.4561, |
| "step": 4620 |
| }, |
| { |
| "epoch": 0.22624544943682964, |
| "grad_norm": 1.592416524887085, |
| "learning_rate": 9.522870773508466e-06, |
| "loss": 0.4538, |
| "step": 4630 |
| }, |
| { |
| "epoch": 0.2267341005155269, |
| "grad_norm": 1.8579721450805664, |
| "learning_rate": 9.519228317201201e-06, |
| "loss": 0.4565, |
| "step": 4640 |
| }, |
| { |
| "epoch": 0.22722275159422414, |
| "grad_norm": 0.8354905247688293, |
| "learning_rate": 9.51557271230992e-06, |
| "loss": 0.4535, |
| "step": 4650 |
| }, |
| { |
| "epoch": 0.2277114026729214, |
| "grad_norm": 1.6300770044326782, |
| "learning_rate": 9.51190396947051e-06, |
| "loss": 0.4555, |
| "step": 4660 |
| }, |
| { |
| "epoch": 0.22820005375161867, |
| "grad_norm": 2.1904120445251465, |
| "learning_rate": 9.508222099357094e-06, |
| "loss": 0.455, |
| "step": 4670 |
| }, |
| { |
| "epoch": 0.22868870483031592, |
| "grad_norm": 2.8534996509552, |
| "learning_rate": 9.504527112681978e-06, |
| "loss": 0.4551, |
| "step": 4680 |
| }, |
| { |
| "epoch": 0.22917735590901317, |
| "grad_norm": 1.0719540119171143, |
| "learning_rate": 9.50081902019564e-06, |
| "loss": 0.4531, |
| "step": 4690 |
| }, |
| { |
| "epoch": 0.22966600698771042, |
| "grad_norm": 1.4179500341415405, |
| "learning_rate": 9.497097832686682e-06, |
| "loss": 0.4531, |
| "step": 4700 |
| }, |
| { |
| "epoch": 0.23015465806640767, |
| "grad_norm": 3.2865960597991943, |
| "learning_rate": 9.493363560981808e-06, |
| "loss": 0.4531, |
| "step": 4710 |
| }, |
| { |
| "epoch": 0.23064330914510495, |
| "grad_norm": 1.4232662916183472, |
| "learning_rate": 9.489616215945788e-06, |
| "loss": 0.4542, |
| "step": 4720 |
| }, |
| { |
| "epoch": 0.2311319602238022, |
| "grad_norm": 1.7004929780960083, |
| "learning_rate": 9.485855808481434e-06, |
| "loss": 0.4537, |
| "step": 4730 |
| }, |
| { |
| "epoch": 0.23162061130249945, |
| "grad_norm": 1.8315871953964233, |
| "learning_rate": 9.482082349529558e-06, |
| "loss": 0.4544, |
| "step": 4740 |
| }, |
| { |
| "epoch": 0.2321092623811967, |
| "grad_norm": 1.7571625709533691, |
| "learning_rate": 9.478295850068945e-06, |
| "loss": 0.4528, |
| "step": 4750 |
| }, |
| { |
| "epoch": 0.23259791345989397, |
| "grad_norm": 2.470423936843872, |
| "learning_rate": 9.474496321116324e-06, |
| "loss": 0.4523, |
| "step": 4760 |
| }, |
| { |
| "epoch": 0.23308656453859122, |
| "grad_norm": 1.8932669162750244, |
| "learning_rate": 9.470683773726331e-06, |
| "loss": 0.4543, |
| "step": 4770 |
| }, |
| { |
| "epoch": 0.23357521561728847, |
| "grad_norm": 0.8342353105545044, |
| "learning_rate": 9.466858218991477e-06, |
| "loss": 0.4537, |
| "step": 4780 |
| }, |
| { |
| "epoch": 0.23406386669598572, |
| "grad_norm": 5.9539055824279785, |
| "learning_rate": 9.463019668042123e-06, |
| "loss": 0.4672, |
| "step": 4790 |
| }, |
| { |
| "epoch": 0.234552517774683, |
| "grad_norm": 1.770120620727539, |
| "learning_rate": 9.459168132046438e-06, |
| "loss": 0.4571, |
| "step": 4800 |
| }, |
| { |
| "epoch": 0.23504116885338025, |
| "grad_norm": 1.4648096561431885, |
| "learning_rate": 9.455303622210371e-06, |
| "loss": 0.4557, |
| "step": 4810 |
| }, |
| { |
| "epoch": 0.2355298199320775, |
| "grad_norm": 1.6342428922653198, |
| "learning_rate": 9.451426149777617e-06, |
| "loss": 0.4531, |
| "step": 4820 |
| }, |
| { |
| "epoch": 0.23601847101077475, |
| "grad_norm": 4.125144958496094, |
| "learning_rate": 9.447535726029593e-06, |
| "loss": 0.4532, |
| "step": 4830 |
| }, |
| { |
| "epoch": 0.236507122089472, |
| "grad_norm": 1.3270002603530884, |
| "learning_rate": 9.443632362285385e-06, |
| "loss": 0.4571, |
| "step": 4840 |
| }, |
| { |
| "epoch": 0.23699577316816928, |
| "grad_norm": 1.3583444356918335, |
| "learning_rate": 9.439716069901735e-06, |
| "loss": 0.4553, |
| "step": 4850 |
| }, |
| { |
| "epoch": 0.23748442424686653, |
| "grad_norm": 1.0656195878982544, |
| "learning_rate": 9.435786860273003e-06, |
| "loss": 0.4501, |
| "step": 4860 |
| }, |
| { |
| "epoch": 0.23797307532556378, |
| "grad_norm": 1.0093538761138916, |
| "learning_rate": 9.431844744831126e-06, |
| "loss": 0.4525, |
| "step": 4870 |
| }, |
| { |
| "epoch": 0.23846172640426103, |
| "grad_norm": 1.5308141708374023, |
| "learning_rate": 9.427889735045593e-06, |
| "loss": 0.4533, |
| "step": 4880 |
| }, |
| { |
| "epoch": 0.2389503774829583, |
| "grad_norm": 1.855191707611084, |
| "learning_rate": 9.423921842423406e-06, |
| "loss": 0.454, |
| "step": 4890 |
| }, |
| { |
| "epoch": 0.23943902856165555, |
| "grad_norm": 1.1727728843688965, |
| "learning_rate": 9.419941078509054e-06, |
| "loss": 0.4523, |
| "step": 4900 |
| }, |
| { |
| "epoch": 0.2399276796403528, |
| "grad_norm": 0.5924420356750488, |
| "learning_rate": 9.415947454884471e-06, |
| "loss": 0.4522, |
| "step": 4910 |
| }, |
| { |
| "epoch": 0.24041633071905005, |
| "grad_norm": 2.744570732116699, |
| "learning_rate": 9.411940983169006e-06, |
| "loss": 0.4529, |
| "step": 4920 |
| }, |
| { |
| "epoch": 0.24090498179774733, |
| "grad_norm": 1.7564640045166016, |
| "learning_rate": 9.407921675019393e-06, |
| "loss": 0.4532, |
| "step": 4930 |
| }, |
| { |
| "epoch": 0.24139363287644458, |
| "grad_norm": 1.4309080839157104, |
| "learning_rate": 9.403889542129707e-06, |
| "loss": 0.4533, |
| "step": 4940 |
| }, |
| { |
| "epoch": 0.24188228395514183, |
| "grad_norm": 1.6379081010818481, |
| "learning_rate": 9.399844596231343e-06, |
| "loss": 0.4515, |
| "step": 4950 |
| }, |
| { |
| "epoch": 0.24237093503383908, |
| "grad_norm": 0.7086363434791565, |
| "learning_rate": 9.39578684909297e-06, |
| "loss": 0.4527, |
| "step": 4960 |
| }, |
| { |
| "epoch": 0.24285958611253633, |
| "grad_norm": 0.987898051738739, |
| "learning_rate": 9.391716312520503e-06, |
| "loss": 0.453, |
| "step": 4970 |
| }, |
| { |
| "epoch": 0.2433482371912336, |
| "grad_norm": 1.1785643100738525, |
| "learning_rate": 9.387632998357073e-06, |
| "loss": 0.4532, |
| "step": 4980 |
| }, |
| { |
| "epoch": 0.24383688826993086, |
| "grad_norm": 2.234311819076538, |
| "learning_rate": 9.383536918482976e-06, |
| "loss": 0.4541, |
| "step": 4990 |
| }, |
| { |
| "epoch": 0.2443255393486281, |
| "grad_norm": 0.9595701098442078, |
| "learning_rate": 9.37942808481566e-06, |
| "loss": 0.4532, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.2443255393486281, |
| "eval_loss": 0.42559880018234253, |
| "eval_runtime": 729.4388, |
| "eval_samples_per_second": 242.526, |
| "eval_steps_per_second": 0.474, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.24481419042732536, |
| "grad_norm": 1.8065398931503296, |
| "learning_rate": 9.375306509309676e-06, |
| "loss": 0.4532, |
| "step": 5010 |
| }, |
| { |
| "epoch": 0.24530284150602263, |
| "grad_norm": 1.7283066511154175, |
| "learning_rate": 9.371172203956646e-06, |
| "loss": 0.4534, |
| "step": 5020 |
| }, |
| { |
| "epoch": 0.24579149258471988, |
| "grad_norm": 1.2136019468307495, |
| "learning_rate": 9.367025180785229e-06, |
| "loss": 0.4536, |
| "step": 5030 |
| }, |
| { |
| "epoch": 0.24628014366341713, |
| "grad_norm": 0.9906538724899292, |
| "learning_rate": 9.36286545186109e-06, |
| "loss": 0.4536, |
| "step": 5040 |
| }, |
| { |
| "epoch": 0.24676879474211438, |
| "grad_norm": 1.390766978263855, |
| "learning_rate": 9.358693029286855e-06, |
| "loss": 0.4514, |
| "step": 5050 |
| }, |
| { |
| "epoch": 0.24725744582081166, |
| "grad_norm": 1.2268085479736328, |
| "learning_rate": 9.354507925202088e-06, |
| "loss": 0.4516, |
| "step": 5060 |
| }, |
| { |
| "epoch": 0.2477460968995089, |
| "grad_norm": 2.122887372970581, |
| "learning_rate": 9.350310151783244e-06, |
| "loss": 0.4491, |
| "step": 5070 |
| }, |
| { |
| "epoch": 0.24823474797820616, |
| "grad_norm": 1.7110397815704346, |
| "learning_rate": 9.346099721243646e-06, |
| "loss": 0.4522, |
| "step": 5080 |
| }, |
| { |
| "epoch": 0.2487233990569034, |
| "grad_norm": 0.9016057252883911, |
| "learning_rate": 9.341876645833434e-06, |
| "loss": 0.4515, |
| "step": 5090 |
| }, |
| { |
| "epoch": 0.2492120501356007, |
| "grad_norm": 1.6355116367340088, |
| "learning_rate": 9.337640937839544e-06, |
| "loss": 0.4545, |
| "step": 5100 |
| }, |
| { |
| "epoch": 0.24970070121429794, |
| "grad_norm": 0.9655557870864868, |
| "learning_rate": 9.333392609585667e-06, |
| "loss": 0.455, |
| "step": 5110 |
| }, |
| { |
| "epoch": 0.25018935229299516, |
| "grad_norm": 1.8593004941940308, |
| "learning_rate": 9.329131673432208e-06, |
| "loss": 0.4522, |
| "step": 5120 |
| }, |
| { |
| "epoch": 0.25067800337169244, |
| "grad_norm": 1.764728307723999, |
| "learning_rate": 9.324858141776254e-06, |
| "loss": 0.4541, |
| "step": 5130 |
| }, |
| { |
| "epoch": 0.2511666544503897, |
| "grad_norm": 0.7554106116294861, |
| "learning_rate": 9.320572027051544e-06, |
| "loss": 0.4566, |
| "step": 5140 |
| }, |
| { |
| "epoch": 0.25165530552908694, |
| "grad_norm": 2.081536054611206, |
| "learning_rate": 9.316273341728423e-06, |
| "loss": 0.4518, |
| "step": 5150 |
| }, |
| { |
| "epoch": 0.2521439566077842, |
| "grad_norm": 0.8827464580535889, |
| "learning_rate": 9.311962098313809e-06, |
| "loss": 0.4502, |
| "step": 5160 |
| }, |
| { |
| "epoch": 0.2526326076864815, |
| "grad_norm": 0.9885526895523071, |
| "learning_rate": 9.307638309351162e-06, |
| "loss": 0.4533, |
| "step": 5170 |
| }, |
| { |
| "epoch": 0.2531212587651787, |
| "grad_norm": 1.7395132780075073, |
| "learning_rate": 9.303301987420436e-06, |
| "loss": 0.4516, |
| "step": 5180 |
| }, |
| { |
| "epoch": 0.253609909843876, |
| "grad_norm": 0.679470419883728, |
| "learning_rate": 9.298953145138057e-06, |
| "loss": 0.4514, |
| "step": 5190 |
| }, |
| { |
| "epoch": 0.2540985609225732, |
| "grad_norm": 1.4864360094070435, |
| "learning_rate": 9.294591795156873e-06, |
| "loss": 0.4502, |
| "step": 5200 |
| }, |
| { |
| "epoch": 0.2545872120012705, |
| "grad_norm": 1.3630485534667969, |
| "learning_rate": 9.290217950166125e-06, |
| "loss": 0.4508, |
| "step": 5210 |
| }, |
| { |
| "epoch": 0.25507586307996777, |
| "grad_norm": 1.3194668292999268, |
| "learning_rate": 9.285831622891409e-06, |
| "loss": 0.4511, |
| "step": 5220 |
| }, |
| { |
| "epoch": 0.255564514158665, |
| "grad_norm": 1.602607250213623, |
| "learning_rate": 9.281432826094635e-06, |
| "loss": 0.4523, |
| "step": 5230 |
| }, |
| { |
| "epoch": 0.25605316523736227, |
| "grad_norm": 0.8694007992744446, |
| "learning_rate": 9.277021572573996e-06, |
| "loss": 0.4522, |
| "step": 5240 |
| }, |
| { |
| "epoch": 0.2565418163160595, |
| "grad_norm": 0.9949777722358704, |
| "learning_rate": 9.272597875163925e-06, |
| "loss": 0.4532, |
| "step": 5250 |
| }, |
| { |
| "epoch": 0.25703046739475677, |
| "grad_norm": 1.0901665687561035, |
| "learning_rate": 9.268161746735063e-06, |
| "loss": 0.4509, |
| "step": 5260 |
| }, |
| { |
| "epoch": 0.25751911847345405, |
| "grad_norm": 0.87218177318573, |
| "learning_rate": 9.263713200194212e-06, |
| "loss": 0.4506, |
| "step": 5270 |
| }, |
| { |
| "epoch": 0.25800776955215127, |
| "grad_norm": 2.6825311183929443, |
| "learning_rate": 9.259252248484317e-06, |
| "loss": 0.4508, |
| "step": 5280 |
| }, |
| { |
| "epoch": 0.25849642063084854, |
| "grad_norm": 1.7756139039993286, |
| "learning_rate": 9.2547789045844e-06, |
| "loss": 0.4524, |
| "step": 5290 |
| }, |
| { |
| "epoch": 0.2589850717095458, |
| "grad_norm": 1.459425449371338, |
| "learning_rate": 9.250293181509551e-06, |
| "loss": 0.4525, |
| "step": 5300 |
| }, |
| { |
| "epoch": 0.25947372278824304, |
| "grad_norm": 0.5840021371841431, |
| "learning_rate": 9.245795092310867e-06, |
| "loss": 0.4508, |
| "step": 5310 |
| }, |
| { |
| "epoch": 0.2599623738669403, |
| "grad_norm": 1.1396574974060059, |
| "learning_rate": 9.241284650075432e-06, |
| "loss": 0.4498, |
| "step": 5320 |
| }, |
| { |
| "epoch": 0.26045102494563754, |
| "grad_norm": 2.9981930255889893, |
| "learning_rate": 9.236761867926264e-06, |
| "loss": 0.4538, |
| "step": 5330 |
| }, |
| { |
| "epoch": 0.2609396760243348, |
| "grad_norm": 1.627025842666626, |
| "learning_rate": 9.23222675902229e-06, |
| "loss": 0.4542, |
| "step": 5340 |
| }, |
| { |
| "epoch": 0.2614283271030321, |
| "grad_norm": 2.1768600940704346, |
| "learning_rate": 9.227679336558295e-06, |
| "loss": 0.4514, |
| "step": 5350 |
| }, |
| { |
| "epoch": 0.2619169781817293, |
| "grad_norm": 0.6379441618919373, |
| "learning_rate": 9.223119613764895e-06, |
| "loss": 0.4504, |
| "step": 5360 |
| }, |
| { |
| "epoch": 0.2624056292604266, |
| "grad_norm": 1.7971820831298828, |
| "learning_rate": 9.21854760390849e-06, |
| "loss": 0.4503, |
| "step": 5370 |
| }, |
| { |
| "epoch": 0.2628942803391239, |
| "grad_norm": 2.099776029586792, |
| "learning_rate": 9.213963320291232e-06, |
| "loss": 0.4509, |
| "step": 5380 |
| }, |
| { |
| "epoch": 0.2633829314178211, |
| "grad_norm": 1.0017653703689575, |
| "learning_rate": 9.209366776250984e-06, |
| "loss": 0.4504, |
| "step": 5390 |
| }, |
| { |
| "epoch": 0.2638715824965184, |
| "grad_norm": 1.0879532098770142, |
| "learning_rate": 9.204757985161274e-06, |
| "loss": 0.4501, |
| "step": 5400 |
| }, |
| { |
| "epoch": 0.2643602335752156, |
| "grad_norm": 1.28214693069458, |
| "learning_rate": 9.20013696043127e-06, |
| "loss": 0.4483, |
| "step": 5410 |
| }, |
| { |
| "epoch": 0.2648488846539129, |
| "grad_norm": 2.457913398742676, |
| "learning_rate": 9.195503715505729e-06, |
| "loss": 0.4517, |
| "step": 5420 |
| }, |
| { |
| "epoch": 0.26533753573261015, |
| "grad_norm": 0.9251576662063599, |
| "learning_rate": 9.190858263864963e-06, |
| "loss": 0.4515, |
| "step": 5430 |
| }, |
| { |
| "epoch": 0.2658261868113074, |
| "grad_norm": 1.5031663179397583, |
| "learning_rate": 9.1862006190248e-06, |
| "loss": 0.4499, |
| "step": 5440 |
| }, |
| { |
| "epoch": 0.26631483789000465, |
| "grad_norm": 1.5385842323303223, |
| "learning_rate": 9.181530794536544e-06, |
| "loss": 0.4497, |
| "step": 5450 |
| }, |
| { |
| "epoch": 0.2668034889687019, |
| "grad_norm": 1.0565071105957031, |
| "learning_rate": 9.176848803986934e-06, |
| "loss": 0.451, |
| "step": 5460 |
| }, |
| { |
| "epoch": 0.26729214004739915, |
| "grad_norm": 0.9009528756141663, |
| "learning_rate": 9.172154660998108e-06, |
| "loss": 0.4507, |
| "step": 5470 |
| }, |
| { |
| "epoch": 0.26778079112609643, |
| "grad_norm": 0.7359398007392883, |
| "learning_rate": 9.167448379227558e-06, |
| "loss": 0.4493, |
| "step": 5480 |
| }, |
| { |
| "epoch": 0.26826944220479365, |
| "grad_norm": 4.481854438781738, |
| "learning_rate": 9.162729972368098e-06, |
| "loss": 0.4516, |
| "step": 5490 |
| }, |
| { |
| "epoch": 0.26875809328349093, |
| "grad_norm": 1.0901057720184326, |
| "learning_rate": 9.157999454147814e-06, |
| "loss": 0.4518, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.26875809328349093, |
| "eval_loss": 0.4273635745048523, |
| "eval_runtime": 728.6534, |
| "eval_samples_per_second": 242.788, |
| "eval_steps_per_second": 0.475, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.2692467443621882, |
| "grad_norm": 1.3341872692108154, |
| "learning_rate": 9.153256838330035e-06, |
| "loss": 0.4499, |
| "step": 5510 |
| }, |
| { |
| "epoch": 0.26973539544088543, |
| "grad_norm": 1.7751141786575317, |
| "learning_rate": 9.148502138713286e-06, |
| "loss": 0.4491, |
| "step": 5520 |
| }, |
| { |
| "epoch": 0.2702240465195827, |
| "grad_norm": 1.0976356267929077, |
| "learning_rate": 9.143735369131249e-06, |
| "loss": 0.4496, |
| "step": 5530 |
| }, |
| { |
| "epoch": 0.2707126975982799, |
| "grad_norm": 2.7799429893493652, |
| "learning_rate": 9.13895654345272e-06, |
| "loss": 0.4501, |
| "step": 5540 |
| }, |
| { |
| "epoch": 0.2712013486769772, |
| "grad_norm": 1.4997122287750244, |
| "learning_rate": 9.134165675581579e-06, |
| "loss": 0.4494, |
| "step": 5550 |
| }, |
| { |
| "epoch": 0.2716899997556745, |
| "grad_norm": 1.3157509565353394, |
| "learning_rate": 9.129362779456737e-06, |
| "loss": 0.4505, |
| "step": 5560 |
| }, |
| { |
| "epoch": 0.2721786508343717, |
| "grad_norm": 2.182624101638794, |
| "learning_rate": 9.124547869052103e-06, |
| "loss": 0.4499, |
| "step": 5570 |
| }, |
| { |
| "epoch": 0.272667301913069, |
| "grad_norm": 0.6629562377929688, |
| "learning_rate": 9.11972095837654e-06, |
| "loss": 0.4501, |
| "step": 5580 |
| }, |
| { |
| "epoch": 0.2731559529917662, |
| "grad_norm": 0.7715067863464355, |
| "learning_rate": 9.114882061473827e-06, |
| "loss": 0.4496, |
| "step": 5590 |
| }, |
| { |
| "epoch": 0.2736446040704635, |
| "grad_norm": 1.0679346323013306, |
| "learning_rate": 9.110031192422613e-06, |
| "loss": 0.4488, |
| "step": 5600 |
| }, |
| { |
| "epoch": 0.27413325514916076, |
| "grad_norm": 2.0973806381225586, |
| "learning_rate": 9.105168365336389e-06, |
| "loss": 0.4505, |
| "step": 5610 |
| }, |
| { |
| "epoch": 0.274621906227858, |
| "grad_norm": 1.7515530586242676, |
| "learning_rate": 9.100293594363425e-06, |
| "loss": 0.4498, |
| "step": 5620 |
| }, |
| { |
| "epoch": 0.27511055730655526, |
| "grad_norm": 1.3219352960586548, |
| "learning_rate": 9.095406893686752e-06, |
| "loss": 0.45, |
| "step": 5630 |
| }, |
| { |
| "epoch": 0.27559920838525254, |
| "grad_norm": 1.7914499044418335, |
| "learning_rate": 9.090508277524103e-06, |
| "loss": 0.4506, |
| "step": 5640 |
| }, |
| { |
| "epoch": 0.27608785946394976, |
| "grad_norm": 1.048553228378296, |
| "learning_rate": 9.085597760127884e-06, |
| "loss": 0.4479, |
| "step": 5650 |
| }, |
| { |
| "epoch": 0.27657651054264704, |
| "grad_norm": 0.9424349069595337, |
| "learning_rate": 9.080675355785123e-06, |
| "loss": 0.4479, |
| "step": 5660 |
| }, |
| { |
| "epoch": 0.27706516162134426, |
| "grad_norm": 2.2007129192352295, |
| "learning_rate": 9.075741078817435e-06, |
| "loss": 0.4517, |
| "step": 5670 |
| }, |
| { |
| "epoch": 0.27755381270004154, |
| "grad_norm": 1.4200412034988403, |
| "learning_rate": 9.070794943580978e-06, |
| "loss": 0.4503, |
| "step": 5680 |
| }, |
| { |
| "epoch": 0.2780424637787388, |
| "grad_norm": 3.359553575515747, |
| "learning_rate": 9.065836964466412e-06, |
| "loss": 0.4504, |
| "step": 5690 |
| }, |
| { |
| "epoch": 0.27853111485743604, |
| "grad_norm": 1.0638636350631714, |
| "learning_rate": 9.060867155898856e-06, |
| "loss": 0.4503, |
| "step": 5700 |
| }, |
| { |
| "epoch": 0.2790197659361333, |
| "grad_norm": 1.592399001121521, |
| "learning_rate": 9.055885532337847e-06, |
| "loss": 0.4485, |
| "step": 5710 |
| }, |
| { |
| "epoch": 0.27950841701483053, |
| "grad_norm": 0.6336447596549988, |
| "learning_rate": 9.050892108277292e-06, |
| "loss": 0.4486, |
| "step": 5720 |
| }, |
| { |
| "epoch": 0.2799970680935278, |
| "grad_norm": 2.1107187271118164, |
| "learning_rate": 9.045886898245441e-06, |
| "loss": 0.451, |
| "step": 5730 |
| }, |
| { |
| "epoch": 0.2804857191722251, |
| "grad_norm": 1.656101107597351, |
| "learning_rate": 9.040869916804827e-06, |
| "loss": 0.4494, |
| "step": 5740 |
| }, |
| { |
| "epoch": 0.2809743702509223, |
| "grad_norm": 1.3328661918640137, |
| "learning_rate": 9.035841178552236e-06, |
| "loss": 0.4492, |
| "step": 5750 |
| }, |
| { |
| "epoch": 0.2814630213296196, |
| "grad_norm": 0.48556625843048096, |
| "learning_rate": 9.030800698118658e-06, |
| "loss": 0.4494, |
| "step": 5760 |
| }, |
| { |
| "epoch": 0.28195167240831687, |
| "grad_norm": 2.595662832260132, |
| "learning_rate": 9.025748490169248e-06, |
| "loss": 0.4498, |
| "step": 5770 |
| }, |
| { |
| "epoch": 0.2824403234870141, |
| "grad_norm": 0.8997907042503357, |
| "learning_rate": 9.02068456940328e-06, |
| "loss": 0.4482, |
| "step": 5780 |
| }, |
| { |
| "epoch": 0.28292897456571137, |
| "grad_norm": 1.9101444482803345, |
| "learning_rate": 9.01560895055411e-06, |
| "loss": 0.4495, |
| "step": 5790 |
| }, |
| { |
| "epoch": 0.2834176256444086, |
| "grad_norm": 0.7567463517189026, |
| "learning_rate": 9.010521648389122e-06, |
| "loss": 0.4501, |
| "step": 5800 |
| }, |
| { |
| "epoch": 0.28390627672310587, |
| "grad_norm": 3.035726547241211, |
| "learning_rate": 9.005422677709701e-06, |
| "loss": 0.4499, |
| "step": 5810 |
| }, |
| { |
| "epoch": 0.28439492780180314, |
| "grad_norm": 1.5301775932312012, |
| "learning_rate": 9.000312053351175e-06, |
| "loss": 0.4484, |
| "step": 5820 |
| }, |
| { |
| "epoch": 0.28488357888050037, |
| "grad_norm": 1.8312554359436035, |
| "learning_rate": 8.995189790182782e-06, |
| "loss": 0.4486, |
| "step": 5830 |
| }, |
| { |
| "epoch": 0.28537222995919764, |
| "grad_norm": 1.362288236618042, |
| "learning_rate": 8.99005590310762e-06, |
| "loss": 0.4497, |
| "step": 5840 |
| }, |
| { |
| "epoch": 0.28586088103789487, |
| "grad_norm": 1.4402492046356201, |
| "learning_rate": 8.984910407062608e-06, |
| "loss": 0.4496, |
| "step": 5850 |
| }, |
| { |
| "epoch": 0.28634953211659214, |
| "grad_norm": 0.9459155201911926, |
| "learning_rate": 8.97975331701844e-06, |
| "loss": 0.4485, |
| "step": 5860 |
| }, |
| { |
| "epoch": 0.2868381831952894, |
| "grad_norm": 1.4187127351760864, |
| "learning_rate": 8.974584647979546e-06, |
| "loss": 0.449, |
| "step": 5870 |
| }, |
| { |
| "epoch": 0.28732683427398664, |
| "grad_norm": 2.6295182704925537, |
| "learning_rate": 8.969404414984035e-06, |
| "loss": 0.4493, |
| "step": 5880 |
| }, |
| { |
| "epoch": 0.2878154853526839, |
| "grad_norm": 1.6124824285507202, |
| "learning_rate": 8.964212633103674e-06, |
| "loss": 0.4496, |
| "step": 5890 |
| }, |
| { |
| "epoch": 0.2883041364313812, |
| "grad_norm": 0.6683453321456909, |
| "learning_rate": 8.959009317443825e-06, |
| "loss": 0.4484, |
| "step": 5900 |
| }, |
| { |
| "epoch": 0.2887927875100784, |
| "grad_norm": 1.6014492511749268, |
| "learning_rate": 8.953794483143406e-06, |
| "loss": 0.4483, |
| "step": 5910 |
| }, |
| { |
| "epoch": 0.2892814385887757, |
| "grad_norm": 1.014033317565918, |
| "learning_rate": 8.948568145374849e-06, |
| "loss": 0.449, |
| "step": 5920 |
| }, |
| { |
| "epoch": 0.2897700896674729, |
| "grad_norm": 1.4940074682235718, |
| "learning_rate": 8.943330319344055e-06, |
| "loss": 0.4496, |
| "step": 5930 |
| }, |
| { |
| "epoch": 0.2902587407461702, |
| "grad_norm": 0.863261342048645, |
| "learning_rate": 8.938081020290352e-06, |
| "loss": 0.4495, |
| "step": 5940 |
| }, |
| { |
| "epoch": 0.2907473918248675, |
| "grad_norm": 1.5809766054153442, |
| "learning_rate": 8.932820263486447e-06, |
| "loss": 0.4493, |
| "step": 5950 |
| }, |
| { |
| "epoch": 0.2912360429035647, |
| "grad_norm": 0.7684280276298523, |
| "learning_rate": 8.927548064238383e-06, |
| "loss": 0.4492, |
| "step": 5960 |
| }, |
| { |
| "epoch": 0.291724693982262, |
| "grad_norm": 2.1927716732025146, |
| "learning_rate": 8.922264437885492e-06, |
| "loss": 0.451, |
| "step": 5970 |
| }, |
| { |
| "epoch": 0.2922133450609592, |
| "grad_norm": 1.0817362070083618, |
| "learning_rate": 8.916969399800359e-06, |
| "loss": 0.4506, |
| "step": 5980 |
| }, |
| { |
| "epoch": 0.2927019961396565, |
| "grad_norm": 0.7948960661888123, |
| "learning_rate": 8.911662965388765e-06, |
| "loss": 0.4499, |
| "step": 5990 |
| }, |
| { |
| "epoch": 0.29319064721835375, |
| "grad_norm": 0.9926490187644958, |
| "learning_rate": 8.906345150089652e-06, |
| "loss": 0.4486, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.29319064721835375, |
| "eval_loss": 0.4233919382095337, |
| "eval_runtime": 728.6738, |
| "eval_samples_per_second": 242.781, |
| "eval_steps_per_second": 0.475, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.293679298297051, |
| "grad_norm": 1.070708155632019, |
| "learning_rate": 8.901015969375074e-06, |
| "loss": 0.4497, |
| "step": 6010 |
| }, |
| { |
| "epoch": 0.29416794937574825, |
| "grad_norm": 1.2505017518997192, |
| "learning_rate": 8.89567543875015e-06, |
| "loss": 0.4479, |
| "step": 6020 |
| }, |
| { |
| "epoch": 0.2946566004544455, |
| "grad_norm": 0.6546292304992676, |
| "learning_rate": 8.890323573753023e-06, |
| "loss": 0.4495, |
| "step": 6030 |
| }, |
| { |
| "epoch": 0.29514525153314275, |
| "grad_norm": 5.781423091888428, |
| "learning_rate": 8.884960389954813e-06, |
| "loss": 0.4478, |
| "step": 6040 |
| }, |
| { |
| "epoch": 0.29563390261184, |
| "grad_norm": 1.123044490814209, |
| "learning_rate": 8.879585902959573e-06, |
| "loss": 0.4493, |
| "step": 6050 |
| }, |
| { |
| "epoch": 0.29612255369053725, |
| "grad_norm": 1.8155452013015747, |
| "learning_rate": 8.874200128404242e-06, |
| "loss": 0.4504, |
| "step": 6060 |
| }, |
| { |
| "epoch": 0.2966112047692345, |
| "grad_norm": 1.4578708410263062, |
| "learning_rate": 8.868803081958597e-06, |
| "loss": 0.4503, |
| "step": 6070 |
| }, |
| { |
| "epoch": 0.2970998558479318, |
| "grad_norm": 1.241621971130371, |
| "learning_rate": 8.863394779325212e-06, |
| "loss": 0.4495, |
| "step": 6080 |
| }, |
| { |
| "epoch": 0.297588506926629, |
| "grad_norm": 0.9442185759544373, |
| "learning_rate": 8.857975236239412e-06, |
| "loss": 0.4484, |
| "step": 6090 |
| }, |
| { |
| "epoch": 0.2980771580053263, |
| "grad_norm": 1.3439468145370483, |
| "learning_rate": 8.852544468469224e-06, |
| "loss": 0.4488, |
| "step": 6100 |
| }, |
| { |
| "epoch": 0.2985658090840235, |
| "grad_norm": 2.7450032234191895, |
| "learning_rate": 8.847102491815336e-06, |
| "loss": 0.4488, |
| "step": 6110 |
| }, |
| { |
| "epoch": 0.2990544601627208, |
| "grad_norm": 1.1001813411712646, |
| "learning_rate": 8.841649322111044e-06, |
| "loss": 0.4501, |
| "step": 6120 |
| }, |
| { |
| "epoch": 0.2995431112414181, |
| "grad_norm": 0.6491206884384155, |
| "learning_rate": 8.836184975222212e-06, |
| "loss": 0.4474, |
| "step": 6130 |
| }, |
| { |
| "epoch": 0.3000317623201153, |
| "grad_norm": 0.40915462374687195, |
| "learning_rate": 8.830709467047223e-06, |
| "loss": 0.4486, |
| "step": 6140 |
| }, |
| { |
| "epoch": 0.3005204133988126, |
| "grad_norm": 0.9558333158493042, |
| "learning_rate": 8.825222813516933e-06, |
| "loss": 0.4468, |
| "step": 6150 |
| }, |
| { |
| "epoch": 0.30100906447750986, |
| "grad_norm": 1.2985563278198242, |
| "learning_rate": 8.819725030594626e-06, |
| "loss": 0.4484, |
| "step": 6160 |
| }, |
| { |
| "epoch": 0.3014977155562071, |
| "grad_norm": 1.1261284351348877, |
| "learning_rate": 8.81421613427597e-06, |
| "loss": 0.4493, |
| "step": 6170 |
| }, |
| { |
| "epoch": 0.30198636663490436, |
| "grad_norm": 1.677819848060608, |
| "learning_rate": 8.80869614058896e-06, |
| "loss": 0.4476, |
| "step": 6180 |
| }, |
| { |
| "epoch": 0.3024750177136016, |
| "grad_norm": 1.6651966571807861, |
| "learning_rate": 8.803165065593884e-06, |
| "loss": 0.4473, |
| "step": 6190 |
| }, |
| { |
| "epoch": 0.30296366879229886, |
| "grad_norm": 0.8978771567344666, |
| "learning_rate": 8.797622925383267e-06, |
| "loss": 0.4478, |
| "step": 6200 |
| }, |
| { |
| "epoch": 0.30345231987099613, |
| "grad_norm": 0.6011471748352051, |
| "learning_rate": 8.792069736081835e-06, |
| "loss": 0.4478, |
| "step": 6210 |
| }, |
| { |
| "epoch": 0.30394097094969336, |
| "grad_norm": 3.1353821754455566, |
| "learning_rate": 8.78650551384645e-06, |
| "loss": 0.4515, |
| "step": 6220 |
| }, |
| { |
| "epoch": 0.30442962202839063, |
| "grad_norm": 1.1291117668151855, |
| "learning_rate": 8.780930274866084e-06, |
| "loss": 0.4498, |
| "step": 6230 |
| }, |
| { |
| "epoch": 0.3049182731070879, |
| "grad_norm": 0.6393253803253174, |
| "learning_rate": 8.775344035361758e-06, |
| "loss": 0.4489, |
| "step": 6240 |
| }, |
| { |
| "epoch": 0.30540692418578513, |
| "grad_norm": 1.493739366531372, |
| "learning_rate": 8.7697468115865e-06, |
| "loss": 0.4498, |
| "step": 6250 |
| }, |
| { |
| "epoch": 0.3058955752644824, |
| "grad_norm": 1.8243303298950195, |
| "learning_rate": 8.76413861982529e-06, |
| "loss": 0.4492, |
| "step": 6260 |
| }, |
| { |
| "epoch": 0.30638422634317963, |
| "grad_norm": 0.7140172719955444, |
| "learning_rate": 8.758519476395029e-06, |
| "loss": 0.4478, |
| "step": 6270 |
| }, |
| { |
| "epoch": 0.3068728774218769, |
| "grad_norm": 0.9651872515678406, |
| "learning_rate": 8.752889397644478e-06, |
| "loss": 0.4484, |
| "step": 6280 |
| }, |
| { |
| "epoch": 0.3073615285005742, |
| "grad_norm": 0.4499496817588806, |
| "learning_rate": 8.747248399954212e-06, |
| "loss": 0.4475, |
| "step": 6290 |
| }, |
| { |
| "epoch": 0.3078501795792714, |
| "grad_norm": 1.09201180934906, |
| "learning_rate": 8.741596499736573e-06, |
| "loss": 0.4491, |
| "step": 6300 |
| }, |
| { |
| "epoch": 0.3083388306579687, |
| "grad_norm": 0.835132360458374, |
| "learning_rate": 8.735933713435627e-06, |
| "loss": 0.4479, |
| "step": 6310 |
| }, |
| { |
| "epoch": 0.3088274817366659, |
| "grad_norm": 0.7163196802139282, |
| "learning_rate": 8.730260057527116e-06, |
| "loss": 0.4484, |
| "step": 6320 |
| }, |
| { |
| "epoch": 0.3093161328153632, |
| "grad_norm": 1.1830068826675415, |
| "learning_rate": 8.724575548518397e-06, |
| "loss": 0.4475, |
| "step": 6330 |
| }, |
| { |
| "epoch": 0.30980478389406046, |
| "grad_norm": 1.2740248441696167, |
| "learning_rate": 8.718880202948414e-06, |
| "loss": 0.447, |
| "step": 6340 |
| }, |
| { |
| "epoch": 0.3102934349727577, |
| "grad_norm": 1.1490364074707031, |
| "learning_rate": 8.713174037387633e-06, |
| "loss": 0.447, |
| "step": 6350 |
| }, |
| { |
| "epoch": 0.31078208605145496, |
| "grad_norm": 1.9249966144561768, |
| "learning_rate": 8.707457068438004e-06, |
| "loss": 0.4477, |
| "step": 6360 |
| }, |
| { |
| "epoch": 0.31127073713015224, |
| "grad_norm": 1.1233280897140503, |
| "learning_rate": 8.701729312732907e-06, |
| "loss": 0.45, |
| "step": 6370 |
| }, |
| { |
| "epoch": 0.31175938820884946, |
| "grad_norm": 0.5614790916442871, |
| "learning_rate": 8.695990786937109e-06, |
| "loss": 0.447, |
| "step": 6380 |
| }, |
| { |
| "epoch": 0.31224803928754674, |
| "grad_norm": 0.8090300559997559, |
| "learning_rate": 8.690241507746706e-06, |
| "loss": 0.4493, |
| "step": 6390 |
| }, |
| { |
| "epoch": 0.31273669036624396, |
| "grad_norm": 0.9170634746551514, |
| "learning_rate": 8.68448149188909e-06, |
| "loss": 0.4479, |
| "step": 6400 |
| }, |
| { |
| "epoch": 0.31322534144494124, |
| "grad_norm": 0.8162520527839661, |
| "learning_rate": 8.67871075612288e-06, |
| "loss": 0.4473, |
| "step": 6410 |
| }, |
| { |
| "epoch": 0.3137139925236385, |
| "grad_norm": 2.09964656829834, |
| "learning_rate": 8.672929317237897e-06, |
| "loss": 0.4466, |
| "step": 6420 |
| }, |
| { |
| "epoch": 0.31420264360233574, |
| "grad_norm": 1.2079427242279053, |
| "learning_rate": 8.667137192055093e-06, |
| "loss": 0.4483, |
| "step": 6430 |
| }, |
| { |
| "epoch": 0.314691294681033, |
| "grad_norm": 0.8319594860076904, |
| "learning_rate": 8.661334397426511e-06, |
| "loss": 0.4457, |
| "step": 6440 |
| }, |
| { |
| "epoch": 0.31517994575973024, |
| "grad_norm": 1.2110413312911987, |
| "learning_rate": 8.655520950235243e-06, |
| "loss": 0.449, |
| "step": 6450 |
| }, |
| { |
| "epoch": 0.3156685968384275, |
| "grad_norm": 1.1097526550292969, |
| "learning_rate": 8.649696867395372e-06, |
| "loss": 0.4482, |
| "step": 6460 |
| }, |
| { |
| "epoch": 0.3161572479171248, |
| "grad_norm": 0.4162759482860565, |
| "learning_rate": 8.643862165851922e-06, |
| "loss": 0.4465, |
| "step": 6470 |
| }, |
| { |
| "epoch": 0.316645898995822, |
| "grad_norm": 0.8267191052436829, |
| "learning_rate": 8.638016862580814e-06, |
| "loss": 0.4469, |
| "step": 6480 |
| }, |
| { |
| "epoch": 0.3171345500745193, |
| "grad_norm": 1.518624186515808, |
| "learning_rate": 8.632160974588817e-06, |
| "loss": 0.4482, |
| "step": 6490 |
| }, |
| { |
| "epoch": 0.31762320115321657, |
| "grad_norm": 0.7973819375038147, |
| "learning_rate": 8.62629451891349e-06, |
| "loss": 0.448, |
| "step": 6500 |
| }, |
| { |
| "epoch": 0.31762320115321657, |
| "eval_loss": 0.4212668538093567, |
| "eval_runtime": 728.4104, |
| "eval_samples_per_second": 242.869, |
| "eval_steps_per_second": 0.475, |
| "step": 6500 |
| }, |
| { |
| "epoch": 0.3181118522319138, |
| "grad_norm": 1.7393572330474854, |
| "learning_rate": 8.620417512623145e-06, |
| "loss": 0.4462, |
| "step": 6510 |
| }, |
| { |
| "epoch": 0.31860050331061107, |
| "grad_norm": 0.8156083226203918, |
| "learning_rate": 8.614529972816787e-06, |
| "loss": 0.4478, |
| "step": 6520 |
| }, |
| { |
| "epoch": 0.3190891543893083, |
| "grad_norm": 0.6622930765151978, |
| "learning_rate": 8.608631916624069e-06, |
| "loss": 0.4468, |
| "step": 6530 |
| }, |
| { |
| "epoch": 0.31957780546800557, |
| "grad_norm": 1.1308300495147705, |
| "learning_rate": 8.602723361205241e-06, |
| "loss": 0.4467, |
| "step": 6540 |
| }, |
| { |
| "epoch": 0.32006645654670285, |
| "grad_norm": 0.8318139314651489, |
| "learning_rate": 8.596804323751098e-06, |
| "loss": 0.4471, |
| "step": 6550 |
| }, |
| { |
| "epoch": 0.32055510762540007, |
| "grad_norm": 0.5246617794036865, |
| "learning_rate": 8.590874821482937e-06, |
| "loss": 0.446, |
| "step": 6560 |
| }, |
| { |
| "epoch": 0.32104375870409735, |
| "grad_norm": 0.8752800226211548, |
| "learning_rate": 8.584934871652498e-06, |
| "loss": 0.4468, |
| "step": 6570 |
| }, |
| { |
| "epoch": 0.32153240978279457, |
| "grad_norm": 1.248165249824524, |
| "learning_rate": 8.57898449154192e-06, |
| "loss": 0.448, |
| "step": 6580 |
| }, |
| { |
| "epoch": 0.32202106086149185, |
| "grad_norm": 1.0610485076904297, |
| "learning_rate": 8.573023698463689e-06, |
| "loss": 0.4468, |
| "step": 6590 |
| }, |
| { |
| "epoch": 0.3225097119401891, |
| "grad_norm": 3.7733728885650635, |
| "learning_rate": 8.567052509760586e-06, |
| "loss": 0.4538, |
| "step": 6600 |
| }, |
| { |
| "epoch": 0.32299836301888635, |
| "grad_norm": 3.644801616668701, |
| "learning_rate": 8.561070942805636e-06, |
| "loss": 0.449, |
| "step": 6610 |
| }, |
| { |
| "epoch": 0.3234870140975836, |
| "grad_norm": 0.774163544178009, |
| "learning_rate": 8.555079015002063e-06, |
| "loss": 0.4471, |
| "step": 6620 |
| }, |
| { |
| "epoch": 0.3239756651762809, |
| "grad_norm": 1.7043198347091675, |
| "learning_rate": 8.549076743783236e-06, |
| "loss": 0.4474, |
| "step": 6630 |
| }, |
| { |
| "epoch": 0.3244643162549781, |
| "grad_norm": 1.1995218992233276, |
| "learning_rate": 8.543064146612612e-06, |
| "loss": 0.4477, |
| "step": 6640 |
| }, |
| { |
| "epoch": 0.3249529673336754, |
| "grad_norm": 1.5275466442108154, |
| "learning_rate": 8.5370412409837e-06, |
| "loss": 0.448, |
| "step": 6650 |
| }, |
| { |
| "epoch": 0.3254416184123726, |
| "grad_norm": 0.8573246002197266, |
| "learning_rate": 8.53100804441999e-06, |
| "loss": 0.4474, |
| "step": 6660 |
| }, |
| { |
| "epoch": 0.3259302694910699, |
| "grad_norm": 1.1308470964431763, |
| "learning_rate": 8.524964574474925e-06, |
| "loss": 0.4466, |
| "step": 6670 |
| }, |
| { |
| "epoch": 0.3264189205697672, |
| "grad_norm": 1.240512728691101, |
| "learning_rate": 8.51891084873183e-06, |
| "loss": 0.4463, |
| "step": 6680 |
| }, |
| { |
| "epoch": 0.3269075716484644, |
| "grad_norm": 2.6846487522125244, |
| "learning_rate": 8.512846884803874e-06, |
| "loss": 0.4476, |
| "step": 6690 |
| }, |
| { |
| "epoch": 0.3273962227271617, |
| "grad_norm": 0.7580792307853699, |
| "learning_rate": 8.506772700334008e-06, |
| "loss": 0.4463, |
| "step": 6700 |
| }, |
| { |
| "epoch": 0.3278848738058589, |
| "grad_norm": 0.49652209877967834, |
| "learning_rate": 8.500688312994925e-06, |
| "loss": 0.4471, |
| "step": 6710 |
| }, |
| { |
| "epoch": 0.3283735248845562, |
| "grad_norm": 2.0272531509399414, |
| "learning_rate": 8.494593740489e-06, |
| "loss": 0.4465, |
| "step": 6720 |
| }, |
| { |
| "epoch": 0.32886217596325346, |
| "grad_norm": 1.3837034702301025, |
| "learning_rate": 8.488489000548244e-06, |
| "loss": 0.4493, |
| "step": 6730 |
| }, |
| { |
| "epoch": 0.3293508270419507, |
| "grad_norm": 1.1367080211639404, |
| "learning_rate": 8.482374110934246e-06, |
| "loss": 0.4474, |
| "step": 6740 |
| }, |
| { |
| "epoch": 0.32983947812064796, |
| "grad_norm": 1.121301531791687, |
| "learning_rate": 8.476249089438129e-06, |
| "loss": 0.4459, |
| "step": 6750 |
| }, |
| { |
| "epoch": 0.33032812919934523, |
| "grad_norm": 0.9756953120231628, |
| "learning_rate": 8.470113953880493e-06, |
| "loss": 0.4468, |
| "step": 6760 |
| }, |
| { |
| "epoch": 0.33081678027804245, |
| "grad_norm": 1.3827910423278809, |
| "learning_rate": 8.463968722111362e-06, |
| "loss": 0.4473, |
| "step": 6770 |
| }, |
| { |
| "epoch": 0.33130543135673973, |
| "grad_norm": 0.6767109632492065, |
| "learning_rate": 8.45781341201014e-06, |
| "loss": 0.447, |
| "step": 6780 |
| }, |
| { |
| "epoch": 0.33179408243543695, |
| "grad_norm": 1.0480477809906006, |
| "learning_rate": 8.451648041485551e-06, |
| "loss": 0.4469, |
| "step": 6790 |
| }, |
| { |
| "epoch": 0.33228273351413423, |
| "grad_norm": 1.5709936618804932, |
| "learning_rate": 8.445472628475588e-06, |
| "loss": 0.4471, |
| "step": 6800 |
| }, |
| { |
| "epoch": 0.3327713845928315, |
| "grad_norm": 1.5795131921768188, |
| "learning_rate": 8.439287190947464e-06, |
| "loss": 0.447, |
| "step": 6810 |
| }, |
| { |
| "epoch": 0.33326003567152873, |
| "grad_norm": 1.1700830459594727, |
| "learning_rate": 8.433091746897559e-06, |
| "loss": 0.4455, |
| "step": 6820 |
| }, |
| { |
| "epoch": 0.333748686750226, |
| "grad_norm": 1.7184573411941528, |
| "learning_rate": 8.426886314351363e-06, |
| "loss": 0.4458, |
| "step": 6830 |
| }, |
| { |
| "epoch": 0.33423733782892323, |
| "grad_norm": 0.4313448667526245, |
| "learning_rate": 8.420670911363433e-06, |
| "loss": 0.447, |
| "step": 6840 |
| }, |
| { |
| "epoch": 0.3347259889076205, |
| "grad_norm": 1.0812926292419434, |
| "learning_rate": 8.41444555601733e-06, |
| "loss": 0.4456, |
| "step": 6850 |
| }, |
| { |
| "epoch": 0.3352146399863178, |
| "grad_norm": 1.1345865726470947, |
| "learning_rate": 8.40821026642557e-06, |
| "loss": 0.447, |
| "step": 6860 |
| }, |
| { |
| "epoch": 0.335703291065015, |
| "grad_norm": 0.6373735070228577, |
| "learning_rate": 8.401965060729582e-06, |
| "loss": 0.4451, |
| "step": 6870 |
| }, |
| { |
| "epoch": 0.3361919421437123, |
| "grad_norm": 6.616238594055176, |
| "learning_rate": 8.395709957099633e-06, |
| "loss": 0.4475, |
| "step": 6880 |
| }, |
| { |
| "epoch": 0.33668059322240956, |
| "grad_norm": 0.9826495051383972, |
| "learning_rate": 8.389444973734797e-06, |
| "loss": 0.4486, |
| "step": 6890 |
| }, |
| { |
| "epoch": 0.3371692443011068, |
| "grad_norm": 1.7973625659942627, |
| "learning_rate": 8.383170128862887e-06, |
| "loss": 0.4473, |
| "step": 6900 |
| }, |
| { |
| "epoch": 0.33765789537980406, |
| "grad_norm": 0.9026411175727844, |
| "learning_rate": 8.376885440740414e-06, |
| "loss": 0.4472, |
| "step": 6910 |
| }, |
| { |
| "epoch": 0.3381465464585013, |
| "grad_norm": 0.9952638149261475, |
| "learning_rate": 8.37059092765252e-06, |
| "loss": 0.4461, |
| "step": 6920 |
| }, |
| { |
| "epoch": 0.33863519753719856, |
| "grad_norm": 2.210338830947876, |
| "learning_rate": 8.364286607912938e-06, |
| "loss": 0.4487, |
| "step": 6930 |
| }, |
| { |
| "epoch": 0.33912384861589584, |
| "grad_norm": 1.286643385887146, |
| "learning_rate": 8.357972499863933e-06, |
| "loss": 0.4469, |
| "step": 6940 |
| }, |
| { |
| "epoch": 0.33961249969459306, |
| "grad_norm": 1.2331130504608154, |
| "learning_rate": 8.351648621876248e-06, |
| "loss": 0.4479, |
| "step": 6950 |
| }, |
| { |
| "epoch": 0.34010115077329034, |
| "grad_norm": 0.7784949541091919, |
| "learning_rate": 8.345314992349047e-06, |
| "loss": 0.4468, |
| "step": 6960 |
| }, |
| { |
| "epoch": 0.34058980185198756, |
| "grad_norm": 3.558990955352783, |
| "learning_rate": 8.338971629709873e-06, |
| "loss": 0.4455, |
| "step": 6970 |
| }, |
| { |
| "epoch": 0.34107845293068484, |
| "grad_norm": 0.712576150894165, |
| "learning_rate": 8.332618552414585e-06, |
| "loss": 0.4461, |
| "step": 6980 |
| }, |
| { |
| "epoch": 0.3415671040093821, |
| "grad_norm": 1.1077570915222168, |
| "learning_rate": 8.326255778947303e-06, |
| "loss": 0.4453, |
| "step": 6990 |
| }, |
| { |
| "epoch": 0.34205575508807934, |
| "grad_norm": 1.3067269325256348, |
| "learning_rate": 8.319883327820363e-06, |
| "loss": 0.4462, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.34205575508807934, |
| "eval_loss": 0.4191921055316925, |
| "eval_runtime": 728.4719, |
| "eval_samples_per_second": 242.848, |
| "eval_steps_per_second": 0.475, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.3425444061667766, |
| "grad_norm": 1.001678705215454, |
| "learning_rate": 8.313501217574253e-06, |
| "loss": 0.4465, |
| "step": 7010 |
| }, |
| { |
| "epoch": 0.3430330572454739, |
| "grad_norm": 0.7304960489273071, |
| "learning_rate": 8.307109466777567e-06, |
| "loss": 0.4458, |
| "step": 7020 |
| }, |
| { |
| "epoch": 0.3435217083241711, |
| "grad_norm": 0.7707636952400208, |
| "learning_rate": 8.30070809402695e-06, |
| "loss": 0.4441, |
| "step": 7030 |
| }, |
| { |
| "epoch": 0.3440103594028684, |
| "grad_norm": 0.9046769142150879, |
| "learning_rate": 8.294297117947035e-06, |
| "loss": 0.4445, |
| "step": 7040 |
| }, |
| { |
| "epoch": 0.3444990104815656, |
| "grad_norm": 0.8245752453804016, |
| "learning_rate": 8.287876557190402e-06, |
| "loss": 0.444, |
| "step": 7050 |
| }, |
| { |
| "epoch": 0.3449876615602629, |
| "grad_norm": 1.746430516242981, |
| "learning_rate": 8.281446430437516e-06, |
| "loss": 0.4469, |
| "step": 7060 |
| }, |
| { |
| "epoch": 0.34547631263896017, |
| "grad_norm": 1.3313848972320557, |
| "learning_rate": 8.27500675639667e-06, |
| "loss": 0.4473, |
| "step": 7070 |
| }, |
| { |
| "epoch": 0.3459649637176574, |
| "grad_norm": 1.182501196861267, |
| "learning_rate": 8.26855755380394e-06, |
| "loss": 0.4453, |
| "step": 7080 |
| }, |
| { |
| "epoch": 0.34645361479635467, |
| "grad_norm": 2.6568055152893066, |
| "learning_rate": 8.262098841423126e-06, |
| "loss": 0.4462, |
| "step": 7090 |
| }, |
| { |
| "epoch": 0.3469422658750519, |
| "grad_norm": 1.4778715372085571, |
| "learning_rate": 8.255630638045685e-06, |
| "loss": 0.4463, |
| "step": 7100 |
| }, |
| { |
| "epoch": 0.34743091695374917, |
| "grad_norm": 1.463995099067688, |
| "learning_rate": 8.249152962490705e-06, |
| "loss": 0.4468, |
| "step": 7110 |
| }, |
| { |
| "epoch": 0.34791956803244645, |
| "grad_norm": 0.9242321848869324, |
| "learning_rate": 8.242665833604818e-06, |
| "loss": 0.446, |
| "step": 7120 |
| }, |
| { |
| "epoch": 0.34840821911114367, |
| "grad_norm": 0.8648793697357178, |
| "learning_rate": 8.236169270262168e-06, |
| "loss": 0.4447, |
| "step": 7130 |
| }, |
| { |
| "epoch": 0.34889687018984095, |
| "grad_norm": 0.7932630777359009, |
| "learning_rate": 8.229663291364349e-06, |
| "loss": 0.4458, |
| "step": 7140 |
| }, |
| { |
| "epoch": 0.3493855212685382, |
| "grad_norm": 2.303868055343628, |
| "learning_rate": 8.223147915840347e-06, |
| "loss": 0.446, |
| "step": 7150 |
| }, |
| { |
| "epoch": 0.34987417234723545, |
| "grad_norm": 0.47625330090522766, |
| "learning_rate": 8.216623162646487e-06, |
| "loss": 0.4469, |
| "step": 7160 |
| }, |
| { |
| "epoch": 0.3503628234259327, |
| "grad_norm": 0.5169132947921753, |
| "learning_rate": 8.210089050766374e-06, |
| "loss": 0.4461, |
| "step": 7170 |
| }, |
| { |
| "epoch": 0.35085147450462995, |
| "grad_norm": 1.1093195676803589, |
| "learning_rate": 8.203545599210851e-06, |
| "loss": 0.4457, |
| "step": 7180 |
| }, |
| { |
| "epoch": 0.3513401255833272, |
| "grad_norm": 1.9182569980621338, |
| "learning_rate": 8.19699282701793e-06, |
| "loss": 0.4453, |
| "step": 7190 |
| }, |
| { |
| "epoch": 0.3518287766620245, |
| "grad_norm": 0.5894930958747864, |
| "learning_rate": 8.190430753252742e-06, |
| "loss": 0.4462, |
| "step": 7200 |
| }, |
| { |
| "epoch": 0.3523174277407217, |
| "grad_norm": 1.633952260017395, |
| "learning_rate": 8.183859397007476e-06, |
| "loss": 0.4446, |
| "step": 7210 |
| }, |
| { |
| "epoch": 0.352806078819419, |
| "grad_norm": 1.9727741479873657, |
| "learning_rate": 8.177278777401332e-06, |
| "loss": 0.448, |
| "step": 7220 |
| }, |
| { |
| "epoch": 0.3532947298981163, |
| "grad_norm": 1.4541544914245605, |
| "learning_rate": 8.170688913580465e-06, |
| "loss": 0.4474, |
| "step": 7230 |
| }, |
| { |
| "epoch": 0.3537833809768135, |
| "grad_norm": 2.3945956230163574, |
| "learning_rate": 8.16408982471792e-06, |
| "loss": 0.4456, |
| "step": 7240 |
| }, |
| { |
| "epoch": 0.3542720320555108, |
| "grad_norm": 0.821062445640564, |
| "learning_rate": 8.157481530013586e-06, |
| "loss": 0.4459, |
| "step": 7250 |
| }, |
| { |
| "epoch": 0.354760683134208, |
| "grad_norm": 0.6615464687347412, |
| "learning_rate": 8.150864048694132e-06, |
| "loss": 0.4458, |
| "step": 7260 |
| }, |
| { |
| "epoch": 0.3552493342129053, |
| "grad_norm": 0.6758638620376587, |
| "learning_rate": 8.14423740001296e-06, |
| "loss": 0.4441, |
| "step": 7270 |
| }, |
| { |
| "epoch": 0.35573798529160255, |
| "grad_norm": 1.2416491508483887, |
| "learning_rate": 8.137601603250139e-06, |
| "loss": 0.4454, |
| "step": 7280 |
| }, |
| { |
| "epoch": 0.3562266363702998, |
| "grad_norm": 0.828959584236145, |
| "learning_rate": 8.13095667771236e-06, |
| "loss": 0.4444, |
| "step": 7290 |
| }, |
| { |
| "epoch": 0.35671528744899705, |
| "grad_norm": 0.5700317025184631, |
| "learning_rate": 8.124302642732871e-06, |
| "loss": 0.4459, |
| "step": 7300 |
| }, |
| { |
| "epoch": 0.3572039385276943, |
| "grad_norm": 0.6910264492034912, |
| "learning_rate": 8.117639517671421e-06, |
| "loss": 0.4446, |
| "step": 7310 |
| }, |
| { |
| "epoch": 0.35769258960639155, |
| "grad_norm": 1.0732626914978027, |
| "learning_rate": 8.11096732191421e-06, |
| "loss": 0.4457, |
| "step": 7320 |
| }, |
| { |
| "epoch": 0.35818124068508883, |
| "grad_norm": 0.9882492423057556, |
| "learning_rate": 8.10428607487383e-06, |
| "loss": 0.445, |
| "step": 7330 |
| }, |
| { |
| "epoch": 0.35866989176378605, |
| "grad_norm": 0.5441588163375854, |
| "learning_rate": 8.097595795989203e-06, |
| "loss": 0.4453, |
| "step": 7340 |
| }, |
| { |
| "epoch": 0.35915854284248333, |
| "grad_norm": 0.8513416647911072, |
| "learning_rate": 8.090896504725534e-06, |
| "loss": 0.4455, |
| "step": 7350 |
| }, |
| { |
| "epoch": 0.3596471939211806, |
| "grad_norm": 0.5936821103096008, |
| "learning_rate": 8.084188220574244e-06, |
| "loss": 0.444, |
| "step": 7360 |
| }, |
| { |
| "epoch": 0.36013584499987783, |
| "grad_norm": 4.0613017082214355, |
| "learning_rate": 8.077470963052922e-06, |
| "loss": 0.447, |
| "step": 7370 |
| }, |
| { |
| "epoch": 0.3606244960785751, |
| "grad_norm": 0.7625659704208374, |
| "learning_rate": 8.070744751705267e-06, |
| "loss": 0.4463, |
| "step": 7380 |
| }, |
| { |
| "epoch": 0.36111314715727233, |
| "grad_norm": 0.8564379811286926, |
| "learning_rate": 8.064009606101023e-06, |
| "loss": 0.4452, |
| "step": 7390 |
| }, |
| { |
| "epoch": 0.3616017982359696, |
| "grad_norm": 0.671668291091919, |
| "learning_rate": 8.05726554583593e-06, |
| "loss": 0.4458, |
| "step": 7400 |
| }, |
| { |
| "epoch": 0.3620904493146669, |
| "grad_norm": 1.2709118127822876, |
| "learning_rate": 8.050512590531669e-06, |
| "loss": 0.4454, |
| "step": 7410 |
| }, |
| { |
| "epoch": 0.3625791003933641, |
| "grad_norm": 0.7745212912559509, |
| "learning_rate": 8.043750759835795e-06, |
| "loss": 0.446, |
| "step": 7420 |
| }, |
| { |
| "epoch": 0.3630677514720614, |
| "grad_norm": 0.7901990413665771, |
| "learning_rate": 8.036980073421693e-06, |
| "loss": 0.4444, |
| "step": 7430 |
| }, |
| { |
| "epoch": 0.3635564025507586, |
| "grad_norm": 1.0258527994155884, |
| "learning_rate": 8.030200550988505e-06, |
| "loss": 0.4437, |
| "step": 7440 |
| }, |
| { |
| "epoch": 0.3640450536294559, |
| "grad_norm": 1.6445204019546509, |
| "learning_rate": 8.023412212261088e-06, |
| "loss": 0.444, |
| "step": 7450 |
| }, |
| { |
| "epoch": 0.36453370470815316, |
| "grad_norm": 1.1179972887039185, |
| "learning_rate": 8.016615076989947e-06, |
| "loss": 0.4449, |
| "step": 7460 |
| }, |
| { |
| "epoch": 0.3650223557868504, |
| "grad_norm": 0.4461180567741394, |
| "learning_rate": 8.009809164951176e-06, |
| "loss": 0.4446, |
| "step": 7470 |
| }, |
| { |
| "epoch": 0.36551100686554766, |
| "grad_norm": 0.6667689681053162, |
| "learning_rate": 8.002994495946415e-06, |
| "loss": 0.4443, |
| "step": 7480 |
| }, |
| { |
| "epoch": 0.36599965794424494, |
| "grad_norm": 0.691374659538269, |
| "learning_rate": 7.996171089802774e-06, |
| "loss": 0.4445, |
| "step": 7490 |
| }, |
| { |
| "epoch": 0.36648830902294216, |
| "grad_norm": 1.3462163209915161, |
| "learning_rate": 7.989338966372787e-06, |
| "loss": 0.4431, |
| "step": 7500 |
| }, |
| { |
| "epoch": 0.36648830902294216, |
| "eval_loss": 0.4194032549858093, |
| "eval_runtime": 728.4338, |
| "eval_samples_per_second": 242.861, |
| "eval_steps_per_second": 0.475, |
| "step": 7500 |
| }, |
| { |
| "epoch": 0.36697696010163944, |
| "grad_norm": 1.0293834209442139, |
| "learning_rate": 7.982498145534348e-06, |
| "loss": 0.4454, |
| "step": 7510 |
| }, |
| { |
| "epoch": 0.36746561118033666, |
| "grad_norm": 1.0880999565124512, |
| "learning_rate": 7.97564864719066e-06, |
| "loss": 0.4435, |
| "step": 7520 |
| }, |
| { |
| "epoch": 0.36795426225903394, |
| "grad_norm": 3.1764519214630127, |
| "learning_rate": 7.968790491270165e-06, |
| "loss": 0.4451, |
| "step": 7530 |
| }, |
| { |
| "epoch": 0.3684429133377312, |
| "grad_norm": 0.6520982980728149, |
| "learning_rate": 7.961923697726506e-06, |
| "loss": 0.4464, |
| "step": 7540 |
| }, |
| { |
| "epoch": 0.36893156441642844, |
| "grad_norm": 1.566203236579895, |
| "learning_rate": 7.955048286538448e-06, |
| "loss": 0.4455, |
| "step": 7550 |
| }, |
| { |
| "epoch": 0.3694202154951257, |
| "grad_norm": 1.396600365638733, |
| "learning_rate": 7.948164277709831e-06, |
| "loss": 0.4466, |
| "step": 7560 |
| }, |
| { |
| "epoch": 0.36990886657382294, |
| "grad_norm": 39.281192779541016, |
| "learning_rate": 7.941271691269511e-06, |
| "loss": 0.4899, |
| "step": 7570 |
| }, |
| { |
| "epoch": 0.3703975176525202, |
| "grad_norm": 2.0359652042388916, |
| "learning_rate": 7.934370547271297e-06, |
| "loss": 0.4587, |
| "step": 7580 |
| }, |
| { |
| "epoch": 0.3708861687312175, |
| "grad_norm": 0.7175349593162537, |
| "learning_rate": 7.9274608657939e-06, |
| "loss": 0.4484, |
| "step": 7590 |
| }, |
| { |
| "epoch": 0.3713748198099147, |
| "grad_norm": 1.1124777793884277, |
| "learning_rate": 7.920542666940871e-06, |
| "loss": 0.4465, |
| "step": 7600 |
| }, |
| { |
| "epoch": 0.371863470888612, |
| "grad_norm": 1.0177866220474243, |
| "learning_rate": 7.913615970840535e-06, |
| "loss": 0.4447, |
| "step": 7610 |
| }, |
| { |
| "epoch": 0.37235212196730927, |
| "grad_norm": 0.7671780586242676, |
| "learning_rate": 7.90668079764595e-06, |
| "loss": 0.4455, |
| "step": 7620 |
| }, |
| { |
| "epoch": 0.3728407730460065, |
| "grad_norm": 1.171650767326355, |
| "learning_rate": 7.899737167534827e-06, |
| "loss": 0.4456, |
| "step": 7630 |
| }, |
| { |
| "epoch": 0.37332942412470377, |
| "grad_norm": 0.5443609356880188, |
| "learning_rate": 7.892785100709492e-06, |
| "loss": 0.4461, |
| "step": 7640 |
| }, |
| { |
| "epoch": 0.373818075203401, |
| "grad_norm": 1.2549580335617065, |
| "learning_rate": 7.885824617396812e-06, |
| "loss": 0.4451, |
| "step": 7650 |
| }, |
| { |
| "epoch": 0.37430672628209827, |
| "grad_norm": 0.7662185430526733, |
| "learning_rate": 7.878855737848139e-06, |
| "loss": 0.4446, |
| "step": 7660 |
| }, |
| { |
| "epoch": 0.37479537736079555, |
| "grad_norm": 1.3419959545135498, |
| "learning_rate": 7.871878482339264e-06, |
| "loss": 0.4468, |
| "step": 7670 |
| }, |
| { |
| "epoch": 0.37528402843949277, |
| "grad_norm": 1.2521858215332031, |
| "learning_rate": 7.864892871170335e-06, |
| "loss": 0.4451, |
| "step": 7680 |
| }, |
| { |
| "epoch": 0.37577267951819004, |
| "grad_norm": 2.5343024730682373, |
| "learning_rate": 7.857898924665817e-06, |
| "loss": 0.4458, |
| "step": 7690 |
| }, |
| { |
| "epoch": 0.37626133059688727, |
| "grad_norm": 0.9986534118652344, |
| "learning_rate": 7.85089666317443e-06, |
| "loss": 0.4451, |
| "step": 7700 |
| }, |
| { |
| "epoch": 0.37674998167558454, |
| "grad_norm": 0.8709741830825806, |
| "learning_rate": 7.843886107069077e-06, |
| "loss": 0.4439, |
| "step": 7710 |
| }, |
| { |
| "epoch": 0.3772386327542818, |
| "grad_norm": 0.8361919522285461, |
| "learning_rate": 7.836867276746805e-06, |
| "loss": 0.4444, |
| "step": 7720 |
| }, |
| { |
| "epoch": 0.37772728383297904, |
| "grad_norm": 1.1930742263793945, |
| "learning_rate": 7.829840192628723e-06, |
| "loss": 0.4461, |
| "step": 7730 |
| }, |
| { |
| "epoch": 0.3782159349116763, |
| "grad_norm": 1.6097028255462646, |
| "learning_rate": 7.822804875159962e-06, |
| "loss": 0.4444, |
| "step": 7740 |
| }, |
| { |
| "epoch": 0.3787045859903736, |
| "grad_norm": 0.6868306994438171, |
| "learning_rate": 7.815761344809609e-06, |
| "loss": 0.4457, |
| "step": 7750 |
| }, |
| { |
| "epoch": 0.3791932370690708, |
| "grad_norm": 0.5000033974647522, |
| "learning_rate": 7.808709622070639e-06, |
| "loss": 0.4449, |
| "step": 7760 |
| }, |
| { |
| "epoch": 0.3796818881477681, |
| "grad_norm": 0.3964043855667114, |
| "learning_rate": 7.801649727459868e-06, |
| "loss": 0.4439, |
| "step": 7770 |
| }, |
| { |
| "epoch": 0.3801705392264653, |
| "grad_norm": 1.3012721538543701, |
| "learning_rate": 7.794581681517886e-06, |
| "loss": 0.4454, |
| "step": 7780 |
| }, |
| { |
| "epoch": 0.3806591903051626, |
| "grad_norm": 0.6892145276069641, |
| "learning_rate": 7.787505504808997e-06, |
| "loss": 0.4456, |
| "step": 7790 |
| }, |
| { |
| "epoch": 0.3811478413838599, |
| "grad_norm": 0.48608964681625366, |
| "learning_rate": 7.780421217921169e-06, |
| "loss": 0.4439, |
| "step": 7800 |
| }, |
| { |
| "epoch": 0.3816364924625571, |
| "grad_norm": 0.7753750085830688, |
| "learning_rate": 7.773328841465958e-06, |
| "loss": 0.4438, |
| "step": 7810 |
| }, |
| { |
| "epoch": 0.3821251435412544, |
| "grad_norm": 0.5739250183105469, |
| "learning_rate": 7.766228396078458e-06, |
| "loss": 0.4444, |
| "step": 7820 |
| }, |
| { |
| "epoch": 0.3826137946199516, |
| "grad_norm": 0.6620212197303772, |
| "learning_rate": 7.759119902417244e-06, |
| "loss": 0.445, |
| "step": 7830 |
| }, |
| { |
| "epoch": 0.3831024456986489, |
| "grad_norm": 0.5474065542221069, |
| "learning_rate": 7.7520033811643e-06, |
| "loss": 0.4436, |
| "step": 7840 |
| }, |
| { |
| "epoch": 0.38359109677734615, |
| "grad_norm": 1.7903695106506348, |
| "learning_rate": 7.744878853024976e-06, |
| "loss": 0.444, |
| "step": 7850 |
| }, |
| { |
| "epoch": 0.3840797478560434, |
| "grad_norm": 0.9528830051422119, |
| "learning_rate": 7.737746338727908e-06, |
| "loss": 0.4436, |
| "step": 7860 |
| }, |
| { |
| "epoch": 0.38456839893474065, |
| "grad_norm": 0.9075807332992554, |
| "learning_rate": 7.730605859024971e-06, |
| "loss": 0.4433, |
| "step": 7870 |
| }, |
| { |
| "epoch": 0.38505705001343793, |
| "grad_norm": 1.1544967889785767, |
| "learning_rate": 7.723457434691216e-06, |
| "loss": 0.4456, |
| "step": 7880 |
| }, |
| { |
| "epoch": 0.38554570109213515, |
| "grad_norm": 1.7026115655899048, |
| "learning_rate": 7.71630108652481e-06, |
| "loss": 0.4458, |
| "step": 7890 |
| }, |
| { |
| "epoch": 0.38603435217083243, |
| "grad_norm": 0.6825501918792725, |
| "learning_rate": 7.709136835346973e-06, |
| "loss": 0.4447, |
| "step": 7900 |
| }, |
| { |
| "epoch": 0.38652300324952965, |
| "grad_norm": 1.6804189682006836, |
| "learning_rate": 7.701964702001916e-06, |
| "loss": 0.4446, |
| "step": 7910 |
| }, |
| { |
| "epoch": 0.38701165432822693, |
| "grad_norm": 3.464137077331543, |
| "learning_rate": 7.694784707356786e-06, |
| "loss": 0.4467, |
| "step": 7920 |
| }, |
| { |
| "epoch": 0.3875003054069242, |
| "grad_norm": 0.6467346549034119, |
| "learning_rate": 7.687596872301603e-06, |
| "loss": 0.4446, |
| "step": 7930 |
| }, |
| { |
| "epoch": 0.38798895648562143, |
| "grad_norm": 1.6307556629180908, |
| "learning_rate": 7.680401217749194e-06, |
| "loss": 0.4454, |
| "step": 7940 |
| }, |
| { |
| "epoch": 0.3884776075643187, |
| "grad_norm": 1.3172680139541626, |
| "learning_rate": 7.67319776463514e-06, |
| "loss": 0.447, |
| "step": 7950 |
| }, |
| { |
| "epoch": 0.3889662586430159, |
| "grad_norm": 0.94371497631073, |
| "learning_rate": 7.665986533917715e-06, |
| "loss": 0.4443, |
| "step": 7960 |
| }, |
| { |
| "epoch": 0.3894549097217132, |
| "grad_norm": 1.032759666442871, |
| "learning_rate": 7.658767546577815e-06, |
| "loss": 0.4435, |
| "step": 7970 |
| }, |
| { |
| "epoch": 0.3899435608004105, |
| "grad_norm": 0.6555205583572388, |
| "learning_rate": 7.651540823618906e-06, |
| "loss": 0.4456, |
| "step": 7980 |
| }, |
| { |
| "epoch": 0.3904322118791077, |
| "grad_norm": 0.8276070952415466, |
| "learning_rate": 7.644306386066964e-06, |
| "loss": 0.4437, |
| "step": 7990 |
| }, |
| { |
| "epoch": 0.390920862957805, |
| "grad_norm": 0.9051567912101746, |
| "learning_rate": 7.637064254970404e-06, |
| "loss": 0.4439, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.390920862957805, |
| "eval_loss": 0.41898027062416077, |
| "eval_runtime": 729.9138, |
| "eval_samples_per_second": 242.368, |
| "eval_steps_per_second": 0.474, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.39140951403650226, |
| "grad_norm": 0.7855016589164734, |
| "learning_rate": 7.629814451400034e-06, |
| "loss": 0.4434, |
| "step": 8010 |
| }, |
| { |
| "epoch": 0.3918981651151995, |
| "grad_norm": 1.8473398685455322, |
| "learning_rate": 7.622556996448973e-06, |
| "loss": 0.4441, |
| "step": 8020 |
| }, |
| { |
| "epoch": 0.39238681619389676, |
| "grad_norm": 1.2307816743850708, |
| "learning_rate": 7.615291911232614e-06, |
| "loss": 0.4426, |
| "step": 8030 |
| }, |
| { |
| "epoch": 0.392875467272594, |
| "grad_norm": 0.9610106945037842, |
| "learning_rate": 7.6080192168885436e-06, |
| "loss": 0.4439, |
| "step": 8040 |
| }, |
| { |
| "epoch": 0.39336411835129126, |
| "grad_norm": 0.8011897206306458, |
| "learning_rate": 7.600738934576484e-06, |
| "loss": 0.4424, |
| "step": 8050 |
| }, |
| { |
| "epoch": 0.39385276942998854, |
| "grad_norm": 0.9333787560462952, |
| "learning_rate": 7.593451085478243e-06, |
| "loss": 0.443, |
| "step": 8060 |
| }, |
| { |
| "epoch": 0.39434142050868576, |
| "grad_norm": 0.5144811868667603, |
| "learning_rate": 7.586155690797636e-06, |
| "loss": 0.4446, |
| "step": 8070 |
| }, |
| { |
| "epoch": 0.39483007158738304, |
| "grad_norm": 1.6834224462509155, |
| "learning_rate": 7.578852771760437e-06, |
| "loss": 0.4443, |
| "step": 8080 |
| }, |
| { |
| "epoch": 0.39531872266608026, |
| "grad_norm": 1.0620421171188354, |
| "learning_rate": 7.571542349614307e-06, |
| "loss": 0.4436, |
| "step": 8090 |
| }, |
| { |
| "epoch": 0.39580737374477754, |
| "grad_norm": 0.8550513386726379, |
| "learning_rate": 7.564224445628741e-06, |
| "loss": 0.4439, |
| "step": 8100 |
| }, |
| { |
| "epoch": 0.3962960248234748, |
| "grad_norm": 0.5044734477996826, |
| "learning_rate": 7.556899081095004e-06, |
| "loss": 0.4446, |
| "step": 8110 |
| }, |
| { |
| "epoch": 0.39678467590217203, |
| "grad_norm": 0.8119836449623108, |
| "learning_rate": 7.549566277326061e-06, |
| "loss": 0.4438, |
| "step": 8120 |
| }, |
| { |
| "epoch": 0.3972733269808693, |
| "grad_norm": 10.883358001708984, |
| "learning_rate": 7.542226055656527e-06, |
| "loss": 0.4461, |
| "step": 8130 |
| }, |
| { |
| "epoch": 0.3977619780595666, |
| "grad_norm": 1.7727267742156982, |
| "learning_rate": 7.534878437442597e-06, |
| "loss": 0.4482, |
| "step": 8140 |
| }, |
| { |
| "epoch": 0.3982506291382638, |
| "grad_norm": 1.0288087129592896, |
| "learning_rate": 7.527523444061984e-06, |
| "loss": 0.4443, |
| "step": 8150 |
| }, |
| { |
| "epoch": 0.3987392802169611, |
| "grad_norm": 1.184952974319458, |
| "learning_rate": 7.520161096913863e-06, |
| "loss": 0.4466, |
| "step": 8160 |
| }, |
| { |
| "epoch": 0.3992279312956583, |
| "grad_norm": 0.9457073211669922, |
| "learning_rate": 7.512791417418802e-06, |
| "loss": 0.4454, |
| "step": 8170 |
| }, |
| { |
| "epoch": 0.3997165823743556, |
| "grad_norm": 0.771334171295166, |
| "learning_rate": 7.505414427018704e-06, |
| "loss": 0.445, |
| "step": 8180 |
| }, |
| { |
| "epoch": 0.40020523345305287, |
| "grad_norm": 1.0723953247070312, |
| "learning_rate": 7.4980301471767404e-06, |
| "loss": 0.4449, |
| "step": 8190 |
| }, |
| { |
| "epoch": 0.4006938845317501, |
| "grad_norm": 0.9210856556892395, |
| "learning_rate": 7.490638599377291e-06, |
| "loss": 0.4432, |
| "step": 8200 |
| }, |
| { |
| "epoch": 0.40118253561044737, |
| "grad_norm": 0.8094615340232849, |
| "learning_rate": 7.483239805125886e-06, |
| "loss": 0.4443, |
| "step": 8210 |
| }, |
| { |
| "epoch": 0.40167118668914464, |
| "grad_norm": 1.3815480470657349, |
| "learning_rate": 7.475833785949134e-06, |
| "loss": 0.4431, |
| "step": 8220 |
| }, |
| { |
| "epoch": 0.40215983776784187, |
| "grad_norm": 1.4028229713439941, |
| "learning_rate": 7.468420563394667e-06, |
| "loss": 0.4449, |
| "step": 8230 |
| }, |
| { |
| "epoch": 0.40264848884653914, |
| "grad_norm": 0.7880713939666748, |
| "learning_rate": 7.461000159031073e-06, |
| "loss": 0.4444, |
| "step": 8240 |
| }, |
| { |
| "epoch": 0.40313713992523637, |
| "grad_norm": 0.573472797870636, |
| "learning_rate": 7.45357259444784e-06, |
| "loss": 0.4432, |
| "step": 8250 |
| }, |
| { |
| "epoch": 0.40362579100393364, |
| "grad_norm": 1.1918740272521973, |
| "learning_rate": 7.4461378912552806e-06, |
| "loss": 0.4428, |
| "step": 8260 |
| }, |
| { |
| "epoch": 0.4041144420826309, |
| "grad_norm": 0.6638442277908325, |
| "learning_rate": 7.438696071084483e-06, |
| "loss": 0.4447, |
| "step": 8270 |
| }, |
| { |
| "epoch": 0.40460309316132814, |
| "grad_norm": 1.2030208110809326, |
| "learning_rate": 7.431247155587243e-06, |
| "loss": 0.4436, |
| "step": 8280 |
| }, |
| { |
| "epoch": 0.4050917442400254, |
| "grad_norm": 0.3726930320262909, |
| "learning_rate": 7.423791166435997e-06, |
| "loss": 0.4433, |
| "step": 8290 |
| }, |
| { |
| "epoch": 0.40558039531872264, |
| "grad_norm": 0.8080679178237915, |
| "learning_rate": 7.4163281253237604e-06, |
| "loss": 0.4437, |
| "step": 8300 |
| }, |
| { |
| "epoch": 0.4060690463974199, |
| "grad_norm": 0.7469872832298279, |
| "learning_rate": 7.40885805396407e-06, |
| "loss": 0.4427, |
| "step": 8310 |
| }, |
| { |
| "epoch": 0.4065576974761172, |
| "grad_norm": 1.38739812374115, |
| "learning_rate": 7.4013809740909135e-06, |
| "loss": 0.443, |
| "step": 8320 |
| }, |
| { |
| "epoch": 0.4070463485548144, |
| "grad_norm": 0.823733389377594, |
| "learning_rate": 7.393896907458674e-06, |
| "loss": 0.4427, |
| "step": 8330 |
| }, |
| { |
| "epoch": 0.4075349996335117, |
| "grad_norm": 0.47151875495910645, |
| "learning_rate": 7.3864058758420595e-06, |
| "loss": 0.445, |
| "step": 8340 |
| }, |
| { |
| "epoch": 0.408023650712209, |
| "grad_norm": 0.34016215801239014, |
| "learning_rate": 7.378907901036042e-06, |
| "loss": 0.4437, |
| "step": 8350 |
| }, |
| { |
| "epoch": 0.4085123017909062, |
| "grad_norm": 0.9797572493553162, |
| "learning_rate": 7.3714030048557935e-06, |
| "loss": 0.4431, |
| "step": 8360 |
| }, |
| { |
| "epoch": 0.4090009528696035, |
| "grad_norm": 0.8803391456604004, |
| "learning_rate": 7.363891209136631e-06, |
| "loss": 0.4431, |
| "step": 8370 |
| }, |
| { |
| "epoch": 0.4094896039483007, |
| "grad_norm": 0.9852266907691956, |
| "learning_rate": 7.356372535733934e-06, |
| "loss": 0.443, |
| "step": 8380 |
| }, |
| { |
| "epoch": 0.409978255026998, |
| "grad_norm": 1.409609317779541, |
| "learning_rate": 7.348847006523103e-06, |
| "loss": 0.4447, |
| "step": 8390 |
| }, |
| { |
| "epoch": 0.41046690610569525, |
| "grad_norm": 0.47717586159706116, |
| "learning_rate": 7.341314643399479e-06, |
| "loss": 0.4443, |
| "step": 8400 |
| }, |
| { |
| "epoch": 0.4109555571843925, |
| "grad_norm": 0.3413306176662445, |
| "learning_rate": 7.333775468278285e-06, |
| "loss": 0.443, |
| "step": 8410 |
| }, |
| { |
| "epoch": 0.41144420826308975, |
| "grad_norm": 0.5356876254081726, |
| "learning_rate": 7.326229503094573e-06, |
| "loss": 0.4429, |
| "step": 8420 |
| }, |
| { |
| "epoch": 0.41193285934178697, |
| "grad_norm": 0.5036433339118958, |
| "learning_rate": 7.318676769803137e-06, |
| "loss": 0.4441, |
| "step": 8430 |
| }, |
| { |
| "epoch": 0.41242151042048425, |
| "grad_norm": 0.9086324572563171, |
| "learning_rate": 7.311117290378473e-06, |
| "loss": 0.4431, |
| "step": 8440 |
| }, |
| { |
| "epoch": 0.4129101614991815, |
| "grad_norm": 0.827485203742981, |
| "learning_rate": 7.303551086814702e-06, |
| "loss": 0.4428, |
| "step": 8450 |
| }, |
| { |
| "epoch": 0.41339881257787875, |
| "grad_norm": 1.1920230388641357, |
| "learning_rate": 7.295978181125503e-06, |
| "loss": 0.445, |
| "step": 8460 |
| }, |
| { |
| "epoch": 0.413887463656576, |
| "grad_norm": 0.9056548476219177, |
| "learning_rate": 7.2883985953440636e-06, |
| "loss": 0.4442, |
| "step": 8470 |
| }, |
| { |
| "epoch": 0.4143761147352733, |
| "grad_norm": 0.5254775881767273, |
| "learning_rate": 7.280812351523003e-06, |
| "loss": 0.4432, |
| "step": 8480 |
| }, |
| { |
| "epoch": 0.4148647658139705, |
| "grad_norm": 0.6151171922683716, |
| "learning_rate": 7.27321947173431e-06, |
| "loss": 0.4442, |
| "step": 8490 |
| }, |
| { |
| "epoch": 0.4153534168926678, |
| "grad_norm": 0.3920780420303345, |
| "learning_rate": 7.265619978069281e-06, |
| "loss": 0.4432, |
| "step": 8500 |
| }, |
| { |
| "epoch": 0.4153534168926678, |
| "eval_loss": 0.41748544573783875, |
| "eval_runtime": 729.7588, |
| "eval_samples_per_second": 242.42, |
| "eval_steps_per_second": 0.474, |
| "step": 8500 |
| }, |
| { |
| "epoch": 0.415842067971365, |
| "grad_norm": 0.5901490449905396, |
| "learning_rate": 7.25801389263846e-06, |
| "loss": 0.4442, |
| "step": 8510 |
| }, |
| { |
| "epoch": 0.4163307190500623, |
| "grad_norm": 0.5799441337585449, |
| "learning_rate": 7.2504012375715645e-06, |
| "loss": 0.4427, |
| "step": 8520 |
| }, |
| { |
| "epoch": 0.4168193701287596, |
| "grad_norm": 0.9592375755310059, |
| "learning_rate": 7.242782035017428e-06, |
| "loss": 0.4439, |
| "step": 8530 |
| }, |
| { |
| "epoch": 0.4173080212074568, |
| "grad_norm": 0.6781924962997437, |
| "learning_rate": 7.235156307143933e-06, |
| "loss": 0.4429, |
| "step": 8540 |
| }, |
| { |
| "epoch": 0.4177966722861541, |
| "grad_norm": 0.37766560912132263, |
| "learning_rate": 7.2275240761379464e-06, |
| "loss": 0.4422, |
| "step": 8550 |
| }, |
| { |
| "epoch": 0.4182853233648513, |
| "grad_norm": 1.2287683486938477, |
| "learning_rate": 7.2198853642052615e-06, |
| "loss": 0.4426, |
| "step": 8560 |
| }, |
| { |
| "epoch": 0.4187739744435486, |
| "grad_norm": 0.9670842289924622, |
| "learning_rate": 7.212240193570519e-06, |
| "loss": 0.4434, |
| "step": 8570 |
| }, |
| { |
| "epoch": 0.41926262552224586, |
| "grad_norm": 0.5393080115318298, |
| "learning_rate": 7.204588586477157e-06, |
| "loss": 0.4433, |
| "step": 8580 |
| }, |
| { |
| "epoch": 0.4197512766009431, |
| "grad_norm": 0.5459208488464355, |
| "learning_rate": 7.196930565187341e-06, |
| "loss": 0.4433, |
| "step": 8590 |
| }, |
| { |
| "epoch": 0.42023992767964036, |
| "grad_norm": 0.8376490473747253, |
| "learning_rate": 7.189266151981893e-06, |
| "loss": 0.4424, |
| "step": 8600 |
| }, |
| { |
| "epoch": 0.42072857875833763, |
| "grad_norm": 3.4486372470855713, |
| "learning_rate": 7.181595369160237e-06, |
| "loss": 0.4425, |
| "step": 8610 |
| }, |
| { |
| "epoch": 0.42121722983703486, |
| "grad_norm": 2.3472955226898193, |
| "learning_rate": 7.173918239040329e-06, |
| "loss": 0.445, |
| "step": 8620 |
| }, |
| { |
| "epoch": 0.42170588091573213, |
| "grad_norm": 2.3312840461730957, |
| "learning_rate": 7.166234783958587e-06, |
| "loss": 0.4447, |
| "step": 8630 |
| }, |
| { |
| "epoch": 0.42219453199442936, |
| "grad_norm": 0.7450709342956543, |
| "learning_rate": 7.158545026269838e-06, |
| "loss": 0.4438, |
| "step": 8640 |
| }, |
| { |
| "epoch": 0.42268318307312663, |
| "grad_norm": 1.204588532447815, |
| "learning_rate": 7.150848988347244e-06, |
| "loss": 0.4441, |
| "step": 8650 |
| }, |
| { |
| "epoch": 0.4231718341518239, |
| "grad_norm": 0.7559615969657898, |
| "learning_rate": 7.143146692582237e-06, |
| "loss": 0.4423, |
| "step": 8660 |
| }, |
| { |
| "epoch": 0.42366048523052113, |
| "grad_norm": 1.6019837856292725, |
| "learning_rate": 7.135438161384458e-06, |
| "loss": 0.4436, |
| "step": 8670 |
| }, |
| { |
| "epoch": 0.4241491363092184, |
| "grad_norm": 1.278933048248291, |
| "learning_rate": 7.127723417181691e-06, |
| "loss": 0.4429, |
| "step": 8680 |
| }, |
| { |
| "epoch": 0.42463778738791563, |
| "grad_norm": 0.6044679284095764, |
| "learning_rate": 7.1200024824197945e-06, |
| "loss": 0.442, |
| "step": 8690 |
| }, |
| { |
| "epoch": 0.4251264384666129, |
| "grad_norm": 0.771743655204773, |
| "learning_rate": 7.1122753795626385e-06, |
| "loss": 0.4429, |
| "step": 8700 |
| }, |
| { |
| "epoch": 0.4256150895453102, |
| "grad_norm": 1.0729281902313232, |
| "learning_rate": 7.1045421310920386e-06, |
| "loss": 0.4436, |
| "step": 8710 |
| }, |
| { |
| "epoch": 0.4261037406240074, |
| "grad_norm": 0.48893994092941284, |
| "learning_rate": 7.096802759507693e-06, |
| "loss": 0.4427, |
| "step": 8720 |
| }, |
| { |
| "epoch": 0.4265923917027047, |
| "grad_norm": 0.5487367510795593, |
| "learning_rate": 7.0890572873271125e-06, |
| "loss": 0.4435, |
| "step": 8730 |
| }, |
| { |
| "epoch": 0.42708104278140196, |
| "grad_norm": 0.39890584349632263, |
| "learning_rate": 7.08130573708556e-06, |
| "loss": 0.4427, |
| "step": 8740 |
| }, |
| { |
| "epoch": 0.4275696938600992, |
| "grad_norm": 0.437925785779953, |
| "learning_rate": 7.07354813133598e-06, |
| "loss": 0.4423, |
| "step": 8750 |
| }, |
| { |
| "epoch": 0.42805834493879646, |
| "grad_norm": 1.0761085748672485, |
| "learning_rate": 7.065784492648937e-06, |
| "loss": 0.4447, |
| "step": 8760 |
| }, |
| { |
| "epoch": 0.4285469960174937, |
| "grad_norm": 0.6409640312194824, |
| "learning_rate": 7.058014843612546e-06, |
| "loss": 0.4432, |
| "step": 8770 |
| }, |
| { |
| "epoch": 0.42903564709619096, |
| "grad_norm": 0.8142459988594055, |
| "learning_rate": 7.050239206832412e-06, |
| "loss": 0.4431, |
| "step": 8780 |
| }, |
| { |
| "epoch": 0.42952429817488824, |
| "grad_norm": 0.7957897782325745, |
| "learning_rate": 7.042457604931558e-06, |
| "loss": 0.4427, |
| "step": 8790 |
| }, |
| { |
| "epoch": 0.43001294925358546, |
| "grad_norm": 0.8293124437332153, |
| "learning_rate": 7.034670060550367e-06, |
| "loss": 0.4425, |
| "step": 8800 |
| }, |
| { |
| "epoch": 0.43050160033228274, |
| "grad_norm": 0.3750956654548645, |
| "learning_rate": 7.026876596346505e-06, |
| "loss": 0.4416, |
| "step": 8810 |
| }, |
| { |
| "epoch": 0.43099025141097996, |
| "grad_norm": 0.755920946598053, |
| "learning_rate": 7.019077234994865e-06, |
| "loss": 0.443, |
| "step": 8820 |
| }, |
| { |
| "epoch": 0.43147890248967724, |
| "grad_norm": 0.6560993194580078, |
| "learning_rate": 7.0112719991875025e-06, |
| "loss": 0.443, |
| "step": 8830 |
| }, |
| { |
| "epoch": 0.4319675535683745, |
| "grad_norm": 0.3859688341617584, |
| "learning_rate": 7.003460911633555e-06, |
| "loss": 0.443, |
| "step": 8840 |
| }, |
| { |
| "epoch": 0.43245620464707174, |
| "grad_norm": 0.6885735988616943, |
| "learning_rate": 6.9956439950591915e-06, |
| "loss": 0.4418, |
| "step": 8850 |
| }, |
| { |
| "epoch": 0.432944855725769, |
| "grad_norm": 1.1823225021362305, |
| "learning_rate": 6.98782127220754e-06, |
| "loss": 0.4433, |
| "step": 8860 |
| }, |
| { |
| "epoch": 0.4334335068044663, |
| "grad_norm": 0.9184996485710144, |
| "learning_rate": 6.979992765838619e-06, |
| "loss": 0.4439, |
| "step": 8870 |
| }, |
| { |
| "epoch": 0.4339221578831635, |
| "grad_norm": 0.6856487989425659, |
| "learning_rate": 6.97215849872928e-06, |
| "loss": 0.4431, |
| "step": 8880 |
| }, |
| { |
| "epoch": 0.4344108089618608, |
| "grad_norm": 0.4063749611377716, |
| "learning_rate": 6.964318493673126e-06, |
| "loss": 0.4435, |
| "step": 8890 |
| }, |
| { |
| "epoch": 0.434899460040558, |
| "grad_norm": 1.1154191493988037, |
| "learning_rate": 6.956472773480463e-06, |
| "loss": 0.4435, |
| "step": 8900 |
| }, |
| { |
| "epoch": 0.4353881111192553, |
| "grad_norm": 0.4631388485431671, |
| "learning_rate": 6.948621360978221e-06, |
| "loss": 0.4424, |
| "step": 8910 |
| }, |
| { |
| "epoch": 0.43587676219795257, |
| "grad_norm": 0.6873944997787476, |
| "learning_rate": 6.94076427900989e-06, |
| "loss": 0.443, |
| "step": 8920 |
| }, |
| { |
| "epoch": 0.4363654132766498, |
| "grad_norm": 0.37667331099510193, |
| "learning_rate": 6.9329015504354605e-06, |
| "loss": 0.4422, |
| "step": 8930 |
| }, |
| { |
| "epoch": 0.43685406435534707, |
| "grad_norm": 1.4186402559280396, |
| "learning_rate": 6.925033198131347e-06, |
| "loss": 0.4428, |
| "step": 8940 |
| }, |
| { |
| "epoch": 0.4373427154340443, |
| "grad_norm": 0.6768743395805359, |
| "learning_rate": 6.917159244990328e-06, |
| "loss": 0.443, |
| "step": 8950 |
| }, |
| { |
| "epoch": 0.43783136651274157, |
| "grad_norm": 0.6607493162155151, |
| "learning_rate": 6.909279713921477e-06, |
| "loss": 0.4429, |
| "step": 8960 |
| }, |
| { |
| "epoch": 0.43832001759143885, |
| "grad_norm": 1.2457571029663086, |
| "learning_rate": 6.9013946278500964e-06, |
| "loss": 0.4431, |
| "step": 8970 |
| }, |
| { |
| "epoch": 0.43880866867013607, |
| "grad_norm": 0.506984531879425, |
| "learning_rate": 6.89350400971765e-06, |
| "loss": 0.444, |
| "step": 8980 |
| }, |
| { |
| "epoch": 0.43929731974883335, |
| "grad_norm": 0.9251278638839722, |
| "learning_rate": 6.885607882481699e-06, |
| "loss": 0.4426, |
| "step": 8990 |
| }, |
| { |
| "epoch": 0.4397859708275306, |
| "grad_norm": 1.2666517496109009, |
| "learning_rate": 6.8777062691158335e-06, |
| "loss": 0.4428, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.4397859708275306, |
| "eval_loss": 0.4181945323944092, |
| "eval_runtime": 729.4373, |
| "eval_samples_per_second": 242.527, |
| "eval_steps_per_second": 0.474, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.44027462190622785, |
| "grad_norm": 0.909946620464325, |
| "learning_rate": 6.869799192609602e-06, |
| "loss": 0.4423, |
| "step": 9010 |
| }, |
| { |
| "epoch": 0.4407632729849251, |
| "grad_norm": 0.6974407434463501, |
| "learning_rate": 6.8618866759684496e-06, |
| "loss": 0.4421, |
| "step": 9020 |
| }, |
| { |
| "epoch": 0.44125192406362235, |
| "grad_norm": 1.4556212425231934, |
| "learning_rate": 6.85396874221365e-06, |
| "loss": 0.4421, |
| "step": 9030 |
| }, |
| { |
| "epoch": 0.4417405751423196, |
| "grad_norm": 0.7077080607414246, |
| "learning_rate": 6.846045414382237e-06, |
| "loss": 0.4415, |
| "step": 9040 |
| }, |
| { |
| "epoch": 0.4422292262210169, |
| "grad_norm": 1.2867698669433594, |
| "learning_rate": 6.838116715526941e-06, |
| "loss": 0.4431, |
| "step": 9050 |
| }, |
| { |
| "epoch": 0.4427178772997141, |
| "grad_norm": 0.350985586643219, |
| "learning_rate": 6.8301826687161135e-06, |
| "loss": 0.4425, |
| "step": 9060 |
| }, |
| { |
| "epoch": 0.4432065283784114, |
| "grad_norm": 0.9761406779289246, |
| "learning_rate": 6.822243297033671e-06, |
| "loss": 0.4415, |
| "step": 9070 |
| }, |
| { |
| "epoch": 0.4436951794571086, |
| "grad_norm": 0.7296372652053833, |
| "learning_rate": 6.814298623579021e-06, |
| "loss": 0.4432, |
| "step": 9080 |
| }, |
| { |
| "epoch": 0.4441838305358059, |
| "grad_norm": 0.8322256803512573, |
| "learning_rate": 6.806348671466996e-06, |
| "loss": 0.442, |
| "step": 9090 |
| }, |
| { |
| "epoch": 0.4446724816145032, |
| "grad_norm": 0.6768003106117249, |
| "learning_rate": 6.798393463827786e-06, |
| "loss": 0.442, |
| "step": 9100 |
| }, |
| { |
| "epoch": 0.4451611326932004, |
| "grad_norm": 0.9105594754219055, |
| "learning_rate": 6.790433023806874e-06, |
| "loss": 0.4426, |
| "step": 9110 |
| }, |
| { |
| "epoch": 0.4456497837718977, |
| "grad_norm": 0.8735663890838623, |
| "learning_rate": 6.782467374564964e-06, |
| "loss": 0.4414, |
| "step": 9120 |
| }, |
| { |
| "epoch": 0.44613843485059496, |
| "grad_norm": 0.4745177626609802, |
| "learning_rate": 6.774496539277917e-06, |
| "loss": 0.4428, |
| "step": 9130 |
| }, |
| { |
| "epoch": 0.4466270859292922, |
| "grad_norm": 0.35364508628845215, |
| "learning_rate": 6.766520541136684e-06, |
| "loss": 0.4425, |
| "step": 9140 |
| }, |
| { |
| "epoch": 0.44711573700798946, |
| "grad_norm": 1.5570448637008667, |
| "learning_rate": 6.758539403347235e-06, |
| "loss": 0.4423, |
| "step": 9150 |
| }, |
| { |
| "epoch": 0.4476043880866867, |
| "grad_norm": 0.6677067279815674, |
| "learning_rate": 6.750553149130498e-06, |
| "loss": 0.4425, |
| "step": 9160 |
| }, |
| { |
| "epoch": 0.44809303916538396, |
| "grad_norm": 0.5844752192497253, |
| "learning_rate": 6.74256180172228e-06, |
| "loss": 0.4427, |
| "step": 9170 |
| }, |
| { |
| "epoch": 0.44858169024408123, |
| "grad_norm": 0.5263113379478455, |
| "learning_rate": 6.734565384373211e-06, |
| "loss": 0.4419, |
| "step": 9180 |
| }, |
| { |
| "epoch": 0.44907034132277845, |
| "grad_norm": 0.7214266061782837, |
| "learning_rate": 6.726563920348671e-06, |
| "loss": 0.442, |
| "step": 9190 |
| }, |
| { |
| "epoch": 0.44955899240147573, |
| "grad_norm": 1.2973275184631348, |
| "learning_rate": 6.718557432928725e-06, |
| "loss": 0.4428, |
| "step": 9200 |
| }, |
| { |
| "epoch": 0.450047643480173, |
| "grad_norm": 1.9566432237625122, |
| "learning_rate": 6.7105459454080535e-06, |
| "loss": 0.4444, |
| "step": 9210 |
| }, |
| { |
| "epoch": 0.45053629455887023, |
| "grad_norm": 1.5999767780303955, |
| "learning_rate": 6.7025294810958785e-06, |
| "loss": 0.4439, |
| "step": 9220 |
| }, |
| { |
| "epoch": 0.4510249456375675, |
| "grad_norm": 1.2058864831924438, |
| "learning_rate": 6.6945080633159096e-06, |
| "loss": 0.4428, |
| "step": 9230 |
| }, |
| { |
| "epoch": 0.45151359671626473, |
| "grad_norm": 0.682574987411499, |
| "learning_rate": 6.686481715406264e-06, |
| "loss": 0.442, |
| "step": 9240 |
| }, |
| { |
| "epoch": 0.452002247794962, |
| "grad_norm": 0.6059571504592896, |
| "learning_rate": 6.678450460719405e-06, |
| "loss": 0.4428, |
| "step": 9250 |
| }, |
| { |
| "epoch": 0.4524908988736593, |
| "grad_norm": 0.9549880027770996, |
| "learning_rate": 6.670414322622072e-06, |
| "loss": 0.4421, |
| "step": 9260 |
| }, |
| { |
| "epoch": 0.4529795499523565, |
| "grad_norm": 0.7796644568443298, |
| "learning_rate": 6.66237332449521e-06, |
| "loss": 0.4428, |
| "step": 9270 |
| }, |
| { |
| "epoch": 0.4534682010310538, |
| "grad_norm": 1.1869465112686157, |
| "learning_rate": 6.6543274897339075e-06, |
| "loss": 0.4439, |
| "step": 9280 |
| }, |
| { |
| "epoch": 0.453956852109751, |
| "grad_norm": 4.104377269744873, |
| "learning_rate": 6.6462768417473215e-06, |
| "loss": 0.4455, |
| "step": 9290 |
| }, |
| { |
| "epoch": 0.4544455031884483, |
| "grad_norm": 0.8395638465881348, |
| "learning_rate": 6.638221403958616e-06, |
| "loss": 0.443, |
| "step": 9300 |
| }, |
| { |
| "epoch": 0.45493415426714556, |
| "grad_norm": 0.7057262659072876, |
| "learning_rate": 6.63016119980489e-06, |
| "loss": 0.443, |
| "step": 9310 |
| }, |
| { |
| "epoch": 0.4554228053458428, |
| "grad_norm": 1.067874789237976, |
| "learning_rate": 6.622096252737111e-06, |
| "loss": 0.4434, |
| "step": 9320 |
| }, |
| { |
| "epoch": 0.45591145642454006, |
| "grad_norm": 1.1366690397262573, |
| "learning_rate": 6.614026586220043e-06, |
| "loss": 0.4442, |
| "step": 9330 |
| }, |
| { |
| "epoch": 0.45640010750323734, |
| "grad_norm": 0.8740336298942566, |
| "learning_rate": 6.605952223732183e-06, |
| "loss": 0.4419, |
| "step": 9340 |
| }, |
| { |
| "epoch": 0.45688875858193456, |
| "grad_norm": 1.2686458826065063, |
| "learning_rate": 6.597873188765693e-06, |
| "loss": 0.4413, |
| "step": 9350 |
| }, |
| { |
| "epoch": 0.45737740966063184, |
| "grad_norm": 0.4457259774208069, |
| "learning_rate": 6.589789504826325e-06, |
| "loss": 0.4421, |
| "step": 9360 |
| }, |
| { |
| "epoch": 0.45786606073932906, |
| "grad_norm": 0.5987876057624817, |
| "learning_rate": 6.581701195433358e-06, |
| "loss": 0.4418, |
| "step": 9370 |
| }, |
| { |
| "epoch": 0.45835471181802634, |
| "grad_norm": 0.430936336517334, |
| "learning_rate": 6.573608284119536e-06, |
| "loss": 0.4415, |
| "step": 9380 |
| }, |
| { |
| "epoch": 0.4588433628967236, |
| "grad_norm": 0.9248373508453369, |
| "learning_rate": 6.565510794430978e-06, |
| "loss": 0.4408, |
| "step": 9390 |
| }, |
| { |
| "epoch": 0.45933201397542084, |
| "grad_norm": 0.5061573386192322, |
| "learning_rate": 6.557408749927139e-06, |
| "loss": 0.4436, |
| "step": 9400 |
| }, |
| { |
| "epoch": 0.4598206650541181, |
| "grad_norm": 0.6956728100776672, |
| "learning_rate": 6.5493021741807125e-06, |
| "loss": 0.4424, |
| "step": 9410 |
| }, |
| { |
| "epoch": 0.46030931613281534, |
| "grad_norm": 0.5525333881378174, |
| "learning_rate": 6.541191090777586e-06, |
| "loss": 0.4419, |
| "step": 9420 |
| }, |
| { |
| "epoch": 0.4607979672115126, |
| "grad_norm": 0.5926039218902588, |
| "learning_rate": 6.5330755233167586e-06, |
| "loss": 0.4417, |
| "step": 9430 |
| }, |
| { |
| "epoch": 0.4612866182902099, |
| "grad_norm": 0.7355937361717224, |
| "learning_rate": 6.524955495410271e-06, |
| "loss": 0.441, |
| "step": 9440 |
| }, |
| { |
| "epoch": 0.4617752693689071, |
| "grad_norm": 0.9713565111160278, |
| "learning_rate": 6.516831030683148e-06, |
| "loss": 0.4412, |
| "step": 9450 |
| }, |
| { |
| "epoch": 0.4622639204476044, |
| "grad_norm": 1.2393561601638794, |
| "learning_rate": 6.508702152773323e-06, |
| "loss": 0.4418, |
| "step": 9460 |
| }, |
| { |
| "epoch": 0.46275257152630167, |
| "grad_norm": 0.83049476146698, |
| "learning_rate": 6.5005688853315615e-06, |
| "loss": 0.4432, |
| "step": 9470 |
| }, |
| { |
| "epoch": 0.4632412226049989, |
| "grad_norm": 0.4689672291278839, |
| "learning_rate": 6.492431252021408e-06, |
| "loss": 0.4425, |
| "step": 9480 |
| }, |
| { |
| "epoch": 0.46372987368369617, |
| "grad_norm": 0.5514821410179138, |
| "learning_rate": 6.484289276519109e-06, |
| "loss": 0.442, |
| "step": 9490 |
| }, |
| { |
| "epoch": 0.4642185247623934, |
| "grad_norm": 0.4042249321937561, |
| "learning_rate": 6.47614298251354e-06, |
| "loss": 0.442, |
| "step": 9500 |
| }, |
| { |
| "epoch": 0.4642185247623934, |
| "eval_loss": 0.41554516553878784, |
| "eval_runtime": 729.5945, |
| "eval_samples_per_second": 242.474, |
| "eval_steps_per_second": 0.474, |
| "step": 9500 |
| }, |
| { |
| "epoch": 0.46470717584109067, |
| "grad_norm": 0.44334179162979126, |
| "learning_rate": 6.467992393706147e-06, |
| "loss": 0.4403, |
| "step": 9510 |
| }, |
| { |
| "epoch": 0.46519582691978795, |
| "grad_norm": 0.49329543113708496, |
| "learning_rate": 6.4598375338108656e-06, |
| "loss": 0.4418, |
| "step": 9520 |
| }, |
| { |
| "epoch": 0.46568447799848517, |
| "grad_norm": 0.5903816223144531, |
| "learning_rate": 6.451678426554061e-06, |
| "loss": 0.4409, |
| "step": 9530 |
| }, |
| { |
| "epoch": 0.46617312907718245, |
| "grad_norm": 1.2968121767044067, |
| "learning_rate": 6.443515095674456e-06, |
| "loss": 0.443, |
| "step": 9540 |
| }, |
| { |
| "epoch": 0.46666178015587967, |
| "grad_norm": 0.7769078612327576, |
| "learning_rate": 6.435347564923062e-06, |
| "loss": 0.4432, |
| "step": 9550 |
| }, |
| { |
| "epoch": 0.46715043123457695, |
| "grad_norm": 0.8744146823883057, |
| "learning_rate": 6.42717585806311e-06, |
| "loss": 0.4411, |
| "step": 9560 |
| }, |
| { |
| "epoch": 0.4676390823132742, |
| "grad_norm": 0.47742319107055664, |
| "learning_rate": 6.418999998869982e-06, |
| "loss": 0.4426, |
| "step": 9570 |
| }, |
| { |
| "epoch": 0.46812773339197145, |
| "grad_norm": 0.4284425973892212, |
| "learning_rate": 6.4108200111311355e-06, |
| "loss": 0.4426, |
| "step": 9580 |
| }, |
| { |
| "epoch": 0.4686163844706687, |
| "grad_norm": 0.37580737471580505, |
| "learning_rate": 6.402635918646049e-06, |
| "loss": 0.4425, |
| "step": 9590 |
| }, |
| { |
| "epoch": 0.469105035549366, |
| "grad_norm": 0.3638119399547577, |
| "learning_rate": 6.394447745226137e-06, |
| "loss": 0.4411, |
| "step": 9600 |
| }, |
| { |
| "epoch": 0.4695936866280632, |
| "grad_norm": 2.9128997325897217, |
| "learning_rate": 6.386255514694688e-06, |
| "loss": 0.4418, |
| "step": 9610 |
| }, |
| { |
| "epoch": 0.4700823377067605, |
| "grad_norm": 0.9645544290542603, |
| "learning_rate": 6.378059250886799e-06, |
| "loss": 0.4419, |
| "step": 9620 |
| }, |
| { |
| "epoch": 0.4705709887854577, |
| "grad_norm": 0.43301165103912354, |
| "learning_rate": 6.369858977649297e-06, |
| "loss": 0.4429, |
| "step": 9630 |
| }, |
| { |
| "epoch": 0.471059639864155, |
| "grad_norm": 1.5179802179336548, |
| "learning_rate": 6.361654718840675e-06, |
| "loss": 0.4414, |
| "step": 9640 |
| }, |
| { |
| "epoch": 0.4715482909428523, |
| "grad_norm": 0.3464379608631134, |
| "learning_rate": 6.353446498331024e-06, |
| "loss": 0.4428, |
| "step": 9650 |
| }, |
| { |
| "epoch": 0.4720369420215495, |
| "grad_norm": 0.89571613073349, |
| "learning_rate": 6.34523434000196e-06, |
| "loss": 0.441, |
| "step": 9660 |
| }, |
| { |
| "epoch": 0.4725255931002468, |
| "grad_norm": 0.6052807569503784, |
| "learning_rate": 6.337018267746558e-06, |
| "loss": 0.4412, |
| "step": 9670 |
| }, |
| { |
| "epoch": 0.473014244178944, |
| "grad_norm": 1.2590041160583496, |
| "learning_rate": 6.328798305469278e-06, |
| "loss": 0.4415, |
| "step": 9680 |
| }, |
| { |
| "epoch": 0.4735028952576413, |
| "grad_norm": 0.6158220171928406, |
| "learning_rate": 6.3205744770858965e-06, |
| "loss": 0.4419, |
| "step": 9690 |
| }, |
| { |
| "epoch": 0.47399154633633855, |
| "grad_norm": 0.46361032128334045, |
| "learning_rate": 6.312346806523444e-06, |
| "loss": 0.4417, |
| "step": 9700 |
| }, |
| { |
| "epoch": 0.4744801974150358, |
| "grad_norm": 1.2066395282745361, |
| "learning_rate": 6.304115317720123e-06, |
| "loss": 0.4415, |
| "step": 9710 |
| }, |
| { |
| "epoch": 0.47496884849373305, |
| "grad_norm": 0.9531863331794739, |
| "learning_rate": 6.295880034625251e-06, |
| "loss": 0.4421, |
| "step": 9720 |
| }, |
| { |
| "epoch": 0.47545749957243033, |
| "grad_norm": 0.3842741549015045, |
| "learning_rate": 6.287640981199183e-06, |
| "loss": 0.4412, |
| "step": 9730 |
| }, |
| { |
| "epoch": 0.47594615065112755, |
| "grad_norm": 0.3795391023159027, |
| "learning_rate": 6.27939818141324e-06, |
| "loss": 0.4414, |
| "step": 9740 |
| }, |
| { |
| "epoch": 0.47643480172982483, |
| "grad_norm": 0.5748067498207092, |
| "learning_rate": 6.2711516592496455e-06, |
| "loss": 0.4411, |
| "step": 9750 |
| }, |
| { |
| "epoch": 0.47692345280852205, |
| "grad_norm": 0.7015309929847717, |
| "learning_rate": 6.262901438701459e-06, |
| "loss": 0.4417, |
| "step": 9760 |
| }, |
| { |
| "epoch": 0.47741210388721933, |
| "grad_norm": 0.4260580539703369, |
| "learning_rate": 6.254647543772489e-06, |
| "loss": 0.4419, |
| "step": 9770 |
| }, |
| { |
| "epoch": 0.4779007549659166, |
| "grad_norm": 0.9640613198280334, |
| "learning_rate": 6.246389998477245e-06, |
| "loss": 0.4405, |
| "step": 9780 |
| }, |
| { |
| "epoch": 0.47838940604461383, |
| "grad_norm": 0.7557575106620789, |
| "learning_rate": 6.23812882684085e-06, |
| "loss": 0.4409, |
| "step": 9790 |
| }, |
| { |
| "epoch": 0.4788780571233111, |
| "grad_norm": 1.2757539749145508, |
| "learning_rate": 6.22986405289898e-06, |
| "loss": 0.4421, |
| "step": 9800 |
| }, |
| { |
| "epoch": 0.47936670820200833, |
| "grad_norm": 1.3108956813812256, |
| "learning_rate": 6.221595700697794e-06, |
| "loss": 0.4434, |
| "step": 9810 |
| }, |
| { |
| "epoch": 0.4798553592807056, |
| "grad_norm": 0.7379423379898071, |
| "learning_rate": 6.2133237942938594e-06, |
| "loss": 0.4423, |
| "step": 9820 |
| }, |
| { |
| "epoch": 0.4803440103594029, |
| "grad_norm": 0.6387200951576233, |
| "learning_rate": 6.2050483577540845e-06, |
| "loss": 0.4419, |
| "step": 9830 |
| }, |
| { |
| "epoch": 0.4808326614381001, |
| "grad_norm": 1.4142051935195923, |
| "learning_rate": 6.19676941515565e-06, |
| "loss": 0.4422, |
| "step": 9840 |
| }, |
| { |
| "epoch": 0.4813213125167974, |
| "grad_norm": 0.9402855038642883, |
| "learning_rate": 6.188486990585936e-06, |
| "loss": 0.4415, |
| "step": 9850 |
| }, |
| { |
| "epoch": 0.48180996359549466, |
| "grad_norm": 1.5236409902572632, |
| "learning_rate": 6.180201108142454e-06, |
| "loss": 0.4409, |
| "step": 9860 |
| }, |
| { |
| "epoch": 0.4822986146741919, |
| "grad_norm": 1.1364696025848389, |
| "learning_rate": 6.171911791932774e-06, |
| "loss": 0.4414, |
| "step": 9870 |
| }, |
| { |
| "epoch": 0.48278726575288916, |
| "grad_norm": 0.48199930787086487, |
| "learning_rate": 6.163619066074462e-06, |
| "loss": 0.4403, |
| "step": 9880 |
| }, |
| { |
| "epoch": 0.4832759168315864, |
| "grad_norm": 0.3505820333957672, |
| "learning_rate": 6.1553229546949975e-06, |
| "loss": 0.4394, |
| "step": 9890 |
| }, |
| { |
| "epoch": 0.48376456791028366, |
| "grad_norm": 1.0344538688659668, |
| "learning_rate": 6.147023481931716e-06, |
| "loss": 0.4408, |
| "step": 9900 |
| }, |
| { |
| "epoch": 0.48425321898898094, |
| "grad_norm": 0.39767566323280334, |
| "learning_rate": 6.138720671931726e-06, |
| "loss": 0.4408, |
| "step": 9910 |
| }, |
| { |
| "epoch": 0.48474187006767816, |
| "grad_norm": 0.6819673180580139, |
| "learning_rate": 6.130414548851854e-06, |
| "loss": 0.4412, |
| "step": 9920 |
| }, |
| { |
| "epoch": 0.48523052114637544, |
| "grad_norm": 1.247071623802185, |
| "learning_rate": 6.122105136858558e-06, |
| "loss": 0.4402, |
| "step": 9930 |
| }, |
| { |
| "epoch": 0.48571917222507266, |
| "grad_norm": 1.1983033418655396, |
| "learning_rate": 6.113792460127872e-06, |
| "loss": 0.442, |
| "step": 9940 |
| }, |
| { |
| "epoch": 0.48620782330376994, |
| "grad_norm": 0.9668486714363098, |
| "learning_rate": 6.105476542845324e-06, |
| "loss": 0.4421, |
| "step": 9950 |
| }, |
| { |
| "epoch": 0.4866964743824672, |
| "grad_norm": 0.5192340016365051, |
| "learning_rate": 6.097157409205867e-06, |
| "loss": 0.4415, |
| "step": 9960 |
| }, |
| { |
| "epoch": 0.48718512546116444, |
| "grad_norm": 1.4621678590774536, |
| "learning_rate": 6.088835083413823e-06, |
| "loss": 0.4413, |
| "step": 9970 |
| }, |
| { |
| "epoch": 0.4876737765398617, |
| "grad_norm": 0.4883491098880768, |
| "learning_rate": 6.080509589682793e-06, |
| "loss": 0.4417, |
| "step": 9980 |
| }, |
| { |
| "epoch": 0.488162427618559, |
| "grad_norm": 0.4201609194278717, |
| "learning_rate": 6.072180952235593e-06, |
| "loss": 0.4414, |
| "step": 9990 |
| }, |
| { |
| "epoch": 0.4886510786972562, |
| "grad_norm": 0.8927011489868164, |
| "learning_rate": 6.063849195304194e-06, |
| "loss": 0.4404, |
| "step": 10000 |
| }, |
| { |
| "epoch": 0.4886510786972562, |
| "eval_loss": 0.4168914556503296, |
| "eval_runtime": 729.6388, |
| "eval_samples_per_second": 242.46, |
| "eval_steps_per_second": 0.474, |
| "step": 10000 |
| }, |
| { |
| "epoch": 0.4891397297759535, |
| "grad_norm": 1.4013339281082153, |
| "learning_rate": 6.055514343129638e-06, |
| "loss": 0.4427, |
| "step": 10010 |
| }, |
| { |
| "epoch": 0.4896283808546507, |
| "grad_norm": 0.5623286366462708, |
| "learning_rate": 6.047176419961972e-06, |
| "loss": 0.4414, |
| "step": 10020 |
| }, |
| { |
| "epoch": 0.490117031933348, |
| "grad_norm": 0.5934634804725647, |
| "learning_rate": 6.038835450060181e-06, |
| "loss": 0.4419, |
| "step": 10030 |
| }, |
| { |
| "epoch": 0.49060568301204527, |
| "grad_norm": 0.5222377181053162, |
| "learning_rate": 6.030491457692108e-06, |
| "loss": 0.4415, |
| "step": 10040 |
| }, |
| { |
| "epoch": 0.4910943340907425, |
| "grad_norm": 0.4764785170555115, |
| "learning_rate": 6.022144467134399e-06, |
| "loss": 0.4407, |
| "step": 10050 |
| }, |
| { |
| "epoch": 0.49158298516943977, |
| "grad_norm": 0.738297700881958, |
| "learning_rate": 6.013794502672415e-06, |
| "loss": 0.442, |
| "step": 10060 |
| }, |
| { |
| "epoch": 0.49207163624813705, |
| "grad_norm": 5.993337631225586, |
| "learning_rate": 6.005441588600176e-06, |
| "loss": 0.4424, |
| "step": 10070 |
| }, |
| { |
| "epoch": 0.49256028732683427, |
| "grad_norm": 2.3225927352905273, |
| "learning_rate": 5.99708574922028e-06, |
| "loss": 0.4487, |
| "step": 10080 |
| }, |
| { |
| "epoch": 0.49304893840553154, |
| "grad_norm": 0.8819851875305176, |
| "learning_rate": 5.988727008843834e-06, |
| "loss": 0.443, |
| "step": 10090 |
| }, |
| { |
| "epoch": 0.49353758948422877, |
| "grad_norm": 0.8179062008857727, |
| "learning_rate": 5.980365391790392e-06, |
| "loss": 0.4415, |
| "step": 10100 |
| }, |
| { |
| "epoch": 0.49402624056292604, |
| "grad_norm": 1.1633132696151733, |
| "learning_rate": 5.97200092238787e-06, |
| "loss": 0.4415, |
| "step": 10110 |
| }, |
| { |
| "epoch": 0.4945148916416233, |
| "grad_norm": 0.5630601048469543, |
| "learning_rate": 5.963633624972491e-06, |
| "loss": 0.4421, |
| "step": 10120 |
| }, |
| { |
| "epoch": 0.49500354272032054, |
| "grad_norm": 0.95186847448349, |
| "learning_rate": 5.955263523888699e-06, |
| "loss": 0.4424, |
| "step": 10130 |
| }, |
| { |
| "epoch": 0.4954921937990178, |
| "grad_norm": 0.9137486219406128, |
| "learning_rate": 5.9468906434890995e-06, |
| "loss": 0.4409, |
| "step": 10140 |
| }, |
| { |
| "epoch": 0.49598084487771504, |
| "grad_norm": 0.5341358184814453, |
| "learning_rate": 5.938515008134381e-06, |
| "loss": 0.4407, |
| "step": 10150 |
| }, |
| { |
| "epoch": 0.4964694959564123, |
| "grad_norm": 0.8407842516899109, |
| "learning_rate": 5.9301366421932505e-06, |
| "loss": 0.4404, |
| "step": 10160 |
| }, |
| { |
| "epoch": 0.4969581470351096, |
| "grad_norm": 0.7001408338546753, |
| "learning_rate": 5.921755570042358e-06, |
| "loss": 0.4412, |
| "step": 10170 |
| }, |
| { |
| "epoch": 0.4974467981138068, |
| "grad_norm": 0.8030371069908142, |
| "learning_rate": 5.913371816066226e-06, |
| "loss": 0.4415, |
| "step": 10180 |
| }, |
| { |
| "epoch": 0.4979354491925041, |
| "grad_norm": 0.9030990600585938, |
| "learning_rate": 5.904985404657187e-06, |
| "loss": 0.4409, |
| "step": 10190 |
| }, |
| { |
| "epoch": 0.4984241002712014, |
| "grad_norm": 1.0445612668991089, |
| "learning_rate": 5.896596360215292e-06, |
| "loss": 0.4419, |
| "step": 10200 |
| }, |
| { |
| "epoch": 0.4989127513498986, |
| "grad_norm": 0.8249901533126831, |
| "learning_rate": 5.888204707148263e-06, |
| "loss": 0.4406, |
| "step": 10210 |
| }, |
| { |
| "epoch": 0.4994014024285959, |
| "grad_norm": 0.4994339048862457, |
| "learning_rate": 5.8798104698714095e-06, |
| "loss": 0.4397, |
| "step": 10220 |
| }, |
| { |
| "epoch": 0.4998900535072931, |
| "grad_norm": 0.5726603865623474, |
| "learning_rate": 5.87141367280756e-06, |
| "loss": 0.4403, |
| "step": 10230 |
| }, |
| { |
| "epoch": 0.5003787045859903, |
| "grad_norm": 0.7047241926193237, |
| "learning_rate": 5.863014340386988e-06, |
| "loss": 0.4416, |
| "step": 10240 |
| }, |
| { |
| "epoch": 0.5008673556646877, |
| "grad_norm": 0.730197012424469, |
| "learning_rate": 5.854612497047347e-06, |
| "loss": 0.4419, |
| "step": 10250 |
| }, |
| { |
| "epoch": 0.5013560067433849, |
| "grad_norm": 0.6394559741020203, |
| "learning_rate": 5.846208167233593e-06, |
| "loss": 0.4407, |
| "step": 10260 |
| }, |
| { |
| "epoch": 0.5018446578220821, |
| "grad_norm": 0.4507567882537842, |
| "learning_rate": 5.837801375397916e-06, |
| "loss": 0.4399, |
| "step": 10270 |
| }, |
| { |
| "epoch": 0.5023333089007794, |
| "grad_norm": 0.6874068975448608, |
| "learning_rate": 5.829392145999673e-06, |
| "loss": 0.442, |
| "step": 10280 |
| }, |
| { |
| "epoch": 0.5028219599794767, |
| "grad_norm": 0.48060235381126404, |
| "learning_rate": 5.820980503505311e-06, |
| "loss": 0.4397, |
| "step": 10290 |
| }, |
| { |
| "epoch": 0.5033106110581739, |
| "grad_norm": 0.4969087541103363, |
| "learning_rate": 5.812566472388298e-06, |
| "loss": 0.4399, |
| "step": 10300 |
| }, |
| { |
| "epoch": 0.5037992621368712, |
| "grad_norm": 0.8934044241905212, |
| "learning_rate": 5.804150077129049e-06, |
| "loss": 0.4406, |
| "step": 10310 |
| }, |
| { |
| "epoch": 0.5042879132155684, |
| "grad_norm": 0.6583065390586853, |
| "learning_rate": 5.795731342214861e-06, |
| "loss": 0.4406, |
| "step": 10320 |
| }, |
| { |
| "epoch": 0.5047765642942657, |
| "grad_norm": 0.7381777167320251, |
| "learning_rate": 5.787310292139837e-06, |
| "loss": 0.4414, |
| "step": 10330 |
| }, |
| { |
| "epoch": 0.505265215372963, |
| "grad_norm": 0.5181640386581421, |
| "learning_rate": 5.778886951404816e-06, |
| "loss": 0.4409, |
| "step": 10340 |
| }, |
| { |
| "epoch": 0.5057538664516602, |
| "grad_norm": 0.44236427545547485, |
| "learning_rate": 5.770461344517302e-06, |
| "loss": 0.4415, |
| "step": 10350 |
| }, |
| { |
| "epoch": 0.5062425175303574, |
| "grad_norm": 0.40523165464401245, |
| "learning_rate": 5.76203349599139e-06, |
| "loss": 0.4404, |
| "step": 10360 |
| }, |
| { |
| "epoch": 0.5067311686090546, |
| "grad_norm": 0.36556363105773926, |
| "learning_rate": 5.753603430347699e-06, |
| "loss": 0.443, |
| "step": 10370 |
| }, |
| { |
| "epoch": 0.507219819687752, |
| "grad_norm": 0.3584481477737427, |
| "learning_rate": 5.7451711721133e-06, |
| "loss": 0.44, |
| "step": 10380 |
| }, |
| { |
| "epoch": 0.5077084707664492, |
| "grad_norm": 0.5849773287773132, |
| "learning_rate": 5.736736745821641e-06, |
| "loss": 0.4398, |
| "step": 10390 |
| }, |
| { |
| "epoch": 0.5081971218451464, |
| "grad_norm": 1.39704167842865, |
| "learning_rate": 5.728300176012476e-06, |
| "loss": 0.4406, |
| "step": 10400 |
| }, |
| { |
| "epoch": 0.5086857729238438, |
| "grad_norm": 1.3421454429626465, |
| "learning_rate": 5.719861487231802e-06, |
| "loss": 0.4411, |
| "step": 10410 |
| }, |
| { |
| "epoch": 0.509174424002541, |
| "grad_norm": 0.8897213935852051, |
| "learning_rate": 5.711420704031774e-06, |
| "loss": 0.4418, |
| "step": 10420 |
| }, |
| { |
| "epoch": 0.5096630750812382, |
| "grad_norm": 0.8177825212478638, |
| "learning_rate": 5.702977850970646e-06, |
| "loss": 0.4414, |
| "step": 10430 |
| }, |
| { |
| "epoch": 0.5101517261599355, |
| "grad_norm": 0.5944052934646606, |
| "learning_rate": 5.694532952612692e-06, |
| "loss": 0.4406, |
| "step": 10440 |
| }, |
| { |
| "epoch": 0.5106403772386328, |
| "grad_norm": 0.48135659098625183, |
| "learning_rate": 5.686086033528135e-06, |
| "loss": 0.4409, |
| "step": 10450 |
| }, |
| { |
| "epoch": 0.51112902831733, |
| "grad_norm": 0.6524203419685364, |
| "learning_rate": 5.67763711829308e-06, |
| "loss": 0.4413, |
| "step": 10460 |
| }, |
| { |
| "epoch": 0.5116176793960273, |
| "grad_norm": 0.8007875084877014, |
| "learning_rate": 5.66918623148944e-06, |
| "loss": 0.4399, |
| "step": 10470 |
| }, |
| { |
| "epoch": 0.5121063304747245, |
| "grad_norm": 0.9331921339035034, |
| "learning_rate": 5.660733397704861e-06, |
| "loss": 0.4407, |
| "step": 10480 |
| }, |
| { |
| "epoch": 0.5125949815534218, |
| "grad_norm": 0.5154340863227844, |
| "learning_rate": 5.652278641532657e-06, |
| "loss": 0.4399, |
| "step": 10490 |
| }, |
| { |
| "epoch": 0.513083632632119, |
| "grad_norm": 0.5443922877311707, |
| "learning_rate": 5.643821987571732e-06, |
| "loss": 0.4418, |
| "step": 10500 |
| }, |
| { |
| "epoch": 0.513083632632119, |
| "eval_loss": 0.41731706261634827, |
| "eval_runtime": 729.1332, |
| "eval_samples_per_second": 242.628, |
| "eval_steps_per_second": 0.475, |
| "step": 10500 |
| }, |
| { |
| "epoch": 0.5135722837108163, |
| "grad_norm": 0.7409442067146301, |
| "learning_rate": 5.635363460426516e-06, |
| "loss": 0.4416, |
| "step": 10510 |
| }, |
| { |
| "epoch": 0.5140609347895135, |
| "grad_norm": 0.5923414826393127, |
| "learning_rate": 5.6269030847068855e-06, |
| "loss": 0.4398, |
| "step": 10520 |
| }, |
| { |
| "epoch": 0.5145495858682108, |
| "grad_norm": 0.4530554711818695, |
| "learning_rate": 5.6184408850280955e-06, |
| "loss": 0.4408, |
| "step": 10530 |
| }, |
| { |
| "epoch": 0.5150382369469081, |
| "grad_norm": 0.49950364232063293, |
| "learning_rate": 5.609976886010708e-06, |
| "loss": 0.4409, |
| "step": 10540 |
| }, |
| { |
| "epoch": 0.5155268880256053, |
| "grad_norm": 2.171323776245117, |
| "learning_rate": 5.601511112280525e-06, |
| "loss": 0.4396, |
| "step": 10550 |
| }, |
| { |
| "epoch": 0.5160155391043025, |
| "grad_norm": 0.5502694249153137, |
| "learning_rate": 5.593043588468502e-06, |
| "loss": 0.4399, |
| "step": 10560 |
| }, |
| { |
| "epoch": 0.5165041901829999, |
| "grad_norm": 0.3139466941356659, |
| "learning_rate": 5.584574339210694e-06, |
| "loss": 0.4405, |
| "step": 10570 |
| }, |
| { |
| "epoch": 0.5169928412616971, |
| "grad_norm": 0.756894588470459, |
| "learning_rate": 5.576103389148175e-06, |
| "loss": 0.4401, |
| "step": 10580 |
| }, |
| { |
| "epoch": 0.5174814923403943, |
| "grad_norm": 0.5437245965003967, |
| "learning_rate": 5.567630762926967e-06, |
| "loss": 0.4412, |
| "step": 10590 |
| }, |
| { |
| "epoch": 0.5179701434190916, |
| "grad_norm": 0.796293318271637, |
| "learning_rate": 5.559156485197967e-06, |
| "loss": 0.441, |
| "step": 10600 |
| }, |
| { |
| "epoch": 0.5184587944977889, |
| "grad_norm": 0.642201840877533, |
| "learning_rate": 5.550680580616878e-06, |
| "loss": 0.4412, |
| "step": 10610 |
| }, |
| { |
| "epoch": 0.5189474455764861, |
| "grad_norm": 0.2663089632987976, |
| "learning_rate": 5.542203073844139e-06, |
| "loss": 0.441, |
| "step": 10620 |
| }, |
| { |
| "epoch": 0.5194360966551834, |
| "grad_norm": 0.45160502195358276, |
| "learning_rate": 5.533723989544844e-06, |
| "loss": 0.4404, |
| "step": 10630 |
| }, |
| { |
| "epoch": 0.5199247477338806, |
| "grad_norm": 0.4790808856487274, |
| "learning_rate": 5.525243352388686e-06, |
| "loss": 0.4402, |
| "step": 10640 |
| }, |
| { |
| "epoch": 0.5204133988125779, |
| "grad_norm": 0.3323618471622467, |
| "learning_rate": 5.5167611870498676e-06, |
| "loss": 0.4398, |
| "step": 10650 |
| }, |
| { |
| "epoch": 0.5209020498912751, |
| "grad_norm": 0.3828358054161072, |
| "learning_rate": 5.508277518207042e-06, |
| "loss": 0.4402, |
| "step": 10660 |
| }, |
| { |
| "epoch": 0.5213907009699724, |
| "grad_norm": 4.394709587097168, |
| "learning_rate": 5.499792370543236e-06, |
| "loss": 0.4401, |
| "step": 10670 |
| }, |
| { |
| "epoch": 0.5218793520486696, |
| "grad_norm": 0.34605780243873596, |
| "learning_rate": 5.491305768745776e-06, |
| "loss": 0.4409, |
| "step": 10680 |
| }, |
| { |
| "epoch": 0.5223680031273669, |
| "grad_norm": 0.41763895750045776, |
| "learning_rate": 5.4828177375062255e-06, |
| "loss": 0.4398, |
| "step": 10690 |
| }, |
| { |
| "epoch": 0.5228566542060642, |
| "grad_norm": 1.0943188667297363, |
| "learning_rate": 5.474328301520302e-06, |
| "loss": 0.4395, |
| "step": 10700 |
| }, |
| { |
| "epoch": 0.5233453052847614, |
| "grad_norm": 0.8608265519142151, |
| "learning_rate": 5.465837485487813e-06, |
| "loss": 0.4413, |
| "step": 10710 |
| }, |
| { |
| "epoch": 0.5238339563634586, |
| "grad_norm": 1.6863247156143188, |
| "learning_rate": 5.457345314112577e-06, |
| "loss": 0.4413, |
| "step": 10720 |
| }, |
| { |
| "epoch": 0.524322607442156, |
| "grad_norm": 0.5766188502311707, |
| "learning_rate": 5.448851812102357e-06, |
| "loss": 0.4406, |
| "step": 10730 |
| }, |
| { |
| "epoch": 0.5248112585208532, |
| "grad_norm": 0.84405517578125, |
| "learning_rate": 5.440357004168795e-06, |
| "loss": 0.441, |
| "step": 10740 |
| }, |
| { |
| "epoch": 0.5252999095995504, |
| "grad_norm": 0.7851320505142212, |
| "learning_rate": 5.431860915027321e-06, |
| "loss": 0.4402, |
| "step": 10750 |
| }, |
| { |
| "epoch": 0.5257885606782478, |
| "grad_norm": 0.4214421510696411, |
| "learning_rate": 5.423363569397101e-06, |
| "loss": 0.441, |
| "step": 10760 |
| }, |
| { |
| "epoch": 0.526277211756945, |
| "grad_norm": 1.1546157598495483, |
| "learning_rate": 5.4148649920009534e-06, |
| "loss": 0.4394, |
| "step": 10770 |
| }, |
| { |
| "epoch": 0.5267658628356422, |
| "grad_norm": 0.7156729102134705, |
| "learning_rate": 5.4063652075652786e-06, |
| "loss": 0.4404, |
| "step": 10780 |
| }, |
| { |
| "epoch": 0.5272545139143394, |
| "grad_norm": 1.8909116983413696, |
| "learning_rate": 5.3978642408199934e-06, |
| "loss": 0.4409, |
| "step": 10790 |
| }, |
| { |
| "epoch": 0.5277431649930368, |
| "grad_norm": 0.5709353685379028, |
| "learning_rate": 5.3893621164984524e-06, |
| "loss": 0.4403, |
| "step": 10800 |
| }, |
| { |
| "epoch": 0.528231816071734, |
| "grad_norm": 0.8182409405708313, |
| "learning_rate": 5.380858859337375e-06, |
| "loss": 0.4404, |
| "step": 10810 |
| }, |
| { |
| "epoch": 0.5287204671504312, |
| "grad_norm": 0.432432621717453, |
| "learning_rate": 5.372354494076784e-06, |
| "loss": 0.4402, |
| "step": 10820 |
| }, |
| { |
| "epoch": 0.5292091182291285, |
| "grad_norm": 0.8491529226303101, |
| "learning_rate": 5.363849045459918e-06, |
| "loss": 0.44, |
| "step": 10830 |
| }, |
| { |
| "epoch": 0.5296977693078257, |
| "grad_norm": 0.4220905900001526, |
| "learning_rate": 5.355342538233172e-06, |
| "loss": 0.4399, |
| "step": 10840 |
| }, |
| { |
| "epoch": 0.530186420386523, |
| "grad_norm": 1.0726776123046875, |
| "learning_rate": 5.346834997146023e-06, |
| "loss": 0.44, |
| "step": 10850 |
| }, |
| { |
| "epoch": 0.5306750714652203, |
| "grad_norm": 0.43123483657836914, |
| "learning_rate": 5.3383264469509484e-06, |
| "loss": 0.4411, |
| "step": 10860 |
| }, |
| { |
| "epoch": 0.5311637225439175, |
| "grad_norm": 0.3041502833366394, |
| "learning_rate": 5.32981691240337e-06, |
| "loss": 0.4414, |
| "step": 10870 |
| }, |
| { |
| "epoch": 0.5316523736226147, |
| "grad_norm": 0.7714064121246338, |
| "learning_rate": 5.321306418261572e-06, |
| "loss": 0.4402, |
| "step": 10880 |
| }, |
| { |
| "epoch": 0.5321410247013121, |
| "grad_norm": 0.441977322101593, |
| "learning_rate": 5.31279498928662e-06, |
| "loss": 0.44, |
| "step": 10890 |
| }, |
| { |
| "epoch": 0.5326296757800093, |
| "grad_norm": 1.5782145261764526, |
| "learning_rate": 5.304282650242318e-06, |
| "loss": 0.4406, |
| "step": 10900 |
| }, |
| { |
| "epoch": 0.5331183268587065, |
| "grad_norm": 0.678400993347168, |
| "learning_rate": 5.295769425895102e-06, |
| "loss": 0.4412, |
| "step": 10910 |
| }, |
| { |
| "epoch": 0.5336069779374037, |
| "grad_norm": 0.9773678183555603, |
| "learning_rate": 5.28725534101399e-06, |
| "loss": 0.4407, |
| "step": 10920 |
| }, |
| { |
| "epoch": 0.5340956290161011, |
| "grad_norm": 0.6579413414001465, |
| "learning_rate": 5.278740420370506e-06, |
| "loss": 0.442, |
| "step": 10930 |
| }, |
| { |
| "epoch": 0.5345842800947983, |
| "grad_norm": 0.760147213935852, |
| "learning_rate": 5.2702246887386e-06, |
| "loss": 0.4407, |
| "step": 10940 |
| }, |
| { |
| "epoch": 0.5350729311734955, |
| "grad_norm": 0.9420449137687683, |
| "learning_rate": 5.261708170894585e-06, |
| "loss": 0.4395, |
| "step": 10950 |
| }, |
| { |
| "epoch": 0.5355615822521929, |
| "grad_norm": 1.1415859460830688, |
| "learning_rate": 5.253190891617063e-06, |
| "loss": 0.4402, |
| "step": 10960 |
| }, |
| { |
| "epoch": 0.5360502333308901, |
| "grad_norm": 0.4278971552848816, |
| "learning_rate": 5.244672875686847e-06, |
| "loss": 0.4405, |
| "step": 10970 |
| }, |
| { |
| "epoch": 0.5365388844095873, |
| "grad_norm": 0.6837897300720215, |
| "learning_rate": 5.236154147886896e-06, |
| "loss": 0.4399, |
| "step": 10980 |
| }, |
| { |
| "epoch": 0.5370275354882846, |
| "grad_norm": 0.7087698578834534, |
| "learning_rate": 5.227634733002241e-06, |
| "loss": 0.4397, |
| "step": 10990 |
| }, |
| { |
| "epoch": 0.5375161865669819, |
| "grad_norm": 1.1717066764831543, |
| "learning_rate": 5.219114655819909e-06, |
| "loss": 0.4408, |
| "step": 11000 |
| }, |
| { |
| "epoch": 0.5375161865669819, |
| "eval_loss": 0.4167872965335846, |
| "eval_runtime": 729.0747, |
| "eval_samples_per_second": 242.647, |
| "eval_steps_per_second": 0.475, |
| "step": 11000 |
| }, |
| { |
| "epoch": 0.5380048376456791, |
| "grad_norm": 0.7513532638549805, |
| "learning_rate": 5.210593941128858e-06, |
| "loss": 0.4408, |
| "step": 11010 |
| }, |
| { |
| "epoch": 0.5384934887243764, |
| "grad_norm": 0.6454597115516663, |
| "learning_rate": 5.202072613719895e-06, |
| "loss": 0.4406, |
| "step": 11020 |
| }, |
| { |
| "epoch": 0.5389821398030736, |
| "grad_norm": 0.459091454744339, |
| "learning_rate": 5.193550698385616e-06, |
| "loss": 0.4411, |
| "step": 11030 |
| }, |
| { |
| "epoch": 0.5394707908817709, |
| "grad_norm": 0.40384477376937866, |
| "learning_rate": 5.185028219920325e-06, |
| "loss": 0.4406, |
| "step": 11040 |
| }, |
| { |
| "epoch": 0.5399594419604681, |
| "grad_norm": 0.44627973437309265, |
| "learning_rate": 5.1765052031199626e-06, |
| "loss": 0.4393, |
| "step": 11050 |
| }, |
| { |
| "epoch": 0.5404480930391654, |
| "grad_norm": 0.9470422267913818, |
| "learning_rate": 5.167981672782038e-06, |
| "loss": 0.4395, |
| "step": 11060 |
| }, |
| { |
| "epoch": 0.5409367441178626, |
| "grad_norm": 0.968473494052887, |
| "learning_rate": 5.1594576537055555e-06, |
| "loss": 0.4401, |
| "step": 11070 |
| }, |
| { |
| "epoch": 0.5414253951965599, |
| "grad_norm": 0.4251641631126404, |
| "learning_rate": 5.150933170690936e-06, |
| "loss": 0.439, |
| "step": 11080 |
| }, |
| { |
| "epoch": 0.5419140462752572, |
| "grad_norm": 0.5823407173156738, |
| "learning_rate": 5.142408248539956e-06, |
| "loss": 0.4398, |
| "step": 11090 |
| }, |
| { |
| "epoch": 0.5424026973539544, |
| "grad_norm": 0.7198439836502075, |
| "learning_rate": 5.133882912055669e-06, |
| "loss": 0.439, |
| "step": 11100 |
| }, |
| { |
| "epoch": 0.5428913484326516, |
| "grad_norm": 0.8078601360321045, |
| "learning_rate": 5.125357186042329e-06, |
| "loss": 0.44, |
| "step": 11110 |
| }, |
| { |
| "epoch": 0.543379999511349, |
| "grad_norm": 0.713046133518219, |
| "learning_rate": 5.116831095305331e-06, |
| "loss": 0.4398, |
| "step": 11120 |
| }, |
| { |
| "epoch": 0.5438686505900462, |
| "grad_norm": 0.5632086396217346, |
| "learning_rate": 5.108304664651123e-06, |
| "loss": 0.4398, |
| "step": 11130 |
| }, |
| { |
| "epoch": 0.5443573016687434, |
| "grad_norm": 1.3256471157073975, |
| "learning_rate": 5.099777918887149e-06, |
| "loss": 0.4396, |
| "step": 11140 |
| }, |
| { |
| "epoch": 0.5448459527474407, |
| "grad_norm": 0.9530927538871765, |
| "learning_rate": 5.0912508828217645e-06, |
| "loss": 0.4389, |
| "step": 11150 |
| }, |
| { |
| "epoch": 0.545334603826138, |
| "grad_norm": 2.566054582595825, |
| "learning_rate": 5.082723581264174e-06, |
| "loss": 0.44, |
| "step": 11160 |
| }, |
| { |
| "epoch": 0.5458232549048352, |
| "grad_norm": 0.6221000552177429, |
| "learning_rate": 5.074196039024351e-06, |
| "loss": 0.4399, |
| "step": 11170 |
| }, |
| { |
| "epoch": 0.5463119059835324, |
| "grad_norm": 0.5202614665031433, |
| "learning_rate": 5.065668280912972e-06, |
| "loss": 0.4394, |
| "step": 11180 |
| }, |
| { |
| "epoch": 0.5468005570622297, |
| "grad_norm": 0.9228209257125854, |
| "learning_rate": 5.057140331741337e-06, |
| "loss": 0.4402, |
| "step": 11190 |
| }, |
| { |
| "epoch": 0.547289208140927, |
| "grad_norm": 0.3940802216529846, |
| "learning_rate": 5.048612216321311e-06, |
| "loss": 0.4393, |
| "step": 11200 |
| }, |
| { |
| "epoch": 0.5477778592196242, |
| "grad_norm": 1.3075381517410278, |
| "learning_rate": 5.04008395946523e-06, |
| "loss": 0.4407, |
| "step": 11210 |
| }, |
| { |
| "epoch": 0.5482665102983215, |
| "grad_norm": 0.4319058060646057, |
| "learning_rate": 5.031555585985852e-06, |
| "loss": 0.4396, |
| "step": 11220 |
| }, |
| { |
| "epoch": 0.5487551613770187, |
| "grad_norm": 0.9323760867118835, |
| "learning_rate": 5.023027120696271e-06, |
| "loss": 0.4395, |
| "step": 11230 |
| }, |
| { |
| "epoch": 0.549243812455716, |
| "grad_norm": 0.726767361164093, |
| "learning_rate": 5.014498588409847e-06, |
| "loss": 0.4403, |
| "step": 11240 |
| }, |
| { |
| "epoch": 0.5497324635344133, |
| "grad_norm": 0.6504103541374207, |
| "learning_rate": 5.005970013940133e-06, |
| "loss": 0.4397, |
| "step": 11250 |
| }, |
| { |
| "epoch": 0.5502211146131105, |
| "grad_norm": 1.1144918203353882, |
| "learning_rate": 4.9974414221008125e-06, |
| "loss": 0.4412, |
| "step": 11260 |
| }, |
| { |
| "epoch": 0.5507097656918077, |
| "grad_norm": 0.6615655422210693, |
| "learning_rate": 4.98891283770561e-06, |
| "loss": 0.4397, |
| "step": 11270 |
| }, |
| { |
| "epoch": 0.5511984167705051, |
| "grad_norm": 0.5302955508232117, |
| "learning_rate": 4.980384285568235e-06, |
| "loss": 0.4395, |
| "step": 11280 |
| }, |
| { |
| "epoch": 0.5516870678492023, |
| "grad_norm": 0.4470592439174652, |
| "learning_rate": 4.9718557905023e-06, |
| "loss": 0.4402, |
| "step": 11290 |
| }, |
| { |
| "epoch": 0.5521757189278995, |
| "grad_norm": 0.4626651108264923, |
| "learning_rate": 4.963327377321253e-06, |
| "loss": 0.4382, |
| "step": 11300 |
| }, |
| { |
| "epoch": 0.5526643700065967, |
| "grad_norm": 0.48710301518440247, |
| "learning_rate": 4.954799070838304e-06, |
| "loss": 0.4404, |
| "step": 11310 |
| }, |
| { |
| "epoch": 0.5531530210852941, |
| "grad_norm": 0.31981727480888367, |
| "learning_rate": 4.946270895866347e-06, |
| "loss": 0.4391, |
| "step": 11320 |
| }, |
| { |
| "epoch": 0.5536416721639913, |
| "grad_norm": 1.153678297996521, |
| "learning_rate": 4.937742877217906e-06, |
| "loss": 0.4403, |
| "step": 11330 |
| }, |
| { |
| "epoch": 0.5541303232426885, |
| "grad_norm": 1.0284217596054077, |
| "learning_rate": 4.929215039705035e-06, |
| "loss": 0.4402, |
| "step": 11340 |
| }, |
| { |
| "epoch": 0.5546189743213858, |
| "grad_norm": 0.7204963564872742, |
| "learning_rate": 4.920687408139271e-06, |
| "loss": 0.439, |
| "step": 11350 |
| }, |
| { |
| "epoch": 0.5551076254000831, |
| "grad_norm": 0.6162496209144592, |
| "learning_rate": 4.91216000733155e-06, |
| "loss": 0.439, |
| "step": 11360 |
| }, |
| { |
| "epoch": 0.5555962764787803, |
| "grad_norm": 0.5891590118408203, |
| "learning_rate": 4.903632862092135e-06, |
| "loss": 0.439, |
| "step": 11370 |
| }, |
| { |
| "epoch": 0.5560849275574776, |
| "grad_norm": 0.5290629863739014, |
| "learning_rate": 4.895105997230544e-06, |
| "loss": 0.4407, |
| "step": 11380 |
| }, |
| { |
| "epoch": 0.5565735786361748, |
| "grad_norm": 1.0910426378250122, |
| "learning_rate": 4.886579437555484e-06, |
| "loss": 0.4386, |
| "step": 11390 |
| }, |
| { |
| "epoch": 0.5570622297148721, |
| "grad_norm": 0.4426107108592987, |
| "learning_rate": 4.878053207874771e-06, |
| "loss": 0.4393, |
| "step": 11400 |
| }, |
| { |
| "epoch": 0.5575508807935694, |
| "grad_norm": 0.7471179366111755, |
| "learning_rate": 4.8695273329952605e-06, |
| "loss": 0.4396, |
| "step": 11410 |
| }, |
| { |
| "epoch": 0.5580395318722666, |
| "grad_norm": 0.6447209119796753, |
| "learning_rate": 4.861001837722775e-06, |
| "loss": 0.4401, |
| "step": 11420 |
| }, |
| { |
| "epoch": 0.5585281829509638, |
| "grad_norm": 0.42997971177101135, |
| "learning_rate": 4.852476746862036e-06, |
| "loss": 0.4389, |
| "step": 11430 |
| }, |
| { |
| "epoch": 0.5590168340296611, |
| "grad_norm": 2.535978317260742, |
| "learning_rate": 4.8439520852165874e-06, |
| "loss": 0.4398, |
| "step": 11440 |
| }, |
| { |
| "epoch": 0.5595054851083584, |
| "grad_norm": 0.5462396740913391, |
| "learning_rate": 4.8354278775887215e-06, |
| "loss": 0.4402, |
| "step": 11450 |
| }, |
| { |
| "epoch": 0.5599941361870556, |
| "grad_norm": 0.6172703504562378, |
| "learning_rate": 4.8269041487794115e-06, |
| "loss": 0.4396, |
| "step": 11460 |
| }, |
| { |
| "epoch": 0.5604827872657528, |
| "grad_norm": 0.6260773539543152, |
| "learning_rate": 4.81838092358824e-06, |
| "loss": 0.4387, |
| "step": 11470 |
| }, |
| { |
| "epoch": 0.5609714383444502, |
| "grad_norm": 0.45732706785202026, |
| "learning_rate": 4.809858226813317e-06, |
| "loss": 0.4398, |
| "step": 11480 |
| }, |
| { |
| "epoch": 0.5614600894231474, |
| "grad_norm": 0.5570266246795654, |
| "learning_rate": 4.801336083251224e-06, |
| "loss": 0.4393, |
| "step": 11490 |
| }, |
| { |
| "epoch": 0.5619487405018446, |
| "grad_norm": 0.4119241535663605, |
| "learning_rate": 4.792814517696927e-06, |
| "loss": 0.4403, |
| "step": 11500 |
| }, |
| { |
| "epoch": 0.5619487405018446, |
| "eval_loss": 0.414816677570343, |
| "eval_runtime": 728.7625, |
| "eval_samples_per_second": 242.751, |
| "eval_steps_per_second": 0.475, |
| "step": 11500 |
| }, |
| { |
| "epoch": 0.562437391580542, |
| "grad_norm": 0.9447699785232544, |
| "learning_rate": 4.784293554943712e-06, |
| "loss": 0.4389, |
| "step": 11510 |
| }, |
| { |
| "epoch": 0.5629260426592392, |
| "grad_norm": 0.5308591723442078, |
| "learning_rate": 4.775773219783112e-06, |
| "loss": 0.4406, |
| "step": 11520 |
| }, |
| { |
| "epoch": 0.5634146937379364, |
| "grad_norm": 1.3727697134017944, |
| "learning_rate": 4.767253537004832e-06, |
| "loss": 0.4401, |
| "step": 11530 |
| }, |
| { |
| "epoch": 0.5639033448166337, |
| "grad_norm": 0.9330416321754456, |
| "learning_rate": 4.7587345313966815e-06, |
| "loss": 0.4406, |
| "step": 11540 |
| }, |
| { |
| "epoch": 0.564391995895331, |
| "grad_norm": 0.32605278491973877, |
| "learning_rate": 4.7502162277445e-06, |
| "loss": 0.44, |
| "step": 11550 |
| }, |
| { |
| "epoch": 0.5648806469740282, |
| "grad_norm": 0.7518082857131958, |
| "learning_rate": 4.741698650832081e-06, |
| "loss": 0.4393, |
| "step": 11560 |
| }, |
| { |
| "epoch": 0.5653692980527254, |
| "grad_norm": 0.4452798068523407, |
| "learning_rate": 4.7331818254411046e-06, |
| "loss": 0.44, |
| "step": 11570 |
| }, |
| { |
| "epoch": 0.5658579491314227, |
| "grad_norm": 0.46914970874786377, |
| "learning_rate": 4.724665776351069e-06, |
| "loss": 0.44, |
| "step": 11580 |
| }, |
| { |
| "epoch": 0.56634660021012, |
| "grad_norm": 0.7172492146492004, |
| "learning_rate": 4.716150528339208e-06, |
| "loss": 0.4404, |
| "step": 11590 |
| }, |
| { |
| "epoch": 0.5668352512888172, |
| "grad_norm": 0.5215288996696472, |
| "learning_rate": 4.7076361061804264e-06, |
| "loss": 0.4399, |
| "step": 11600 |
| }, |
| { |
| "epoch": 0.5673239023675145, |
| "grad_norm": 0.5500718355178833, |
| "learning_rate": 4.69912253464723e-06, |
| "loss": 0.4399, |
| "step": 11610 |
| }, |
| { |
| "epoch": 0.5678125534462117, |
| "grad_norm": 0.9018455147743225, |
| "learning_rate": 4.690609838509642e-06, |
| "loss": 0.4396, |
| "step": 11620 |
| }, |
| { |
| "epoch": 0.568301204524909, |
| "grad_norm": 0.46901988983154297, |
| "learning_rate": 4.682098042535145e-06, |
| "loss": 0.4382, |
| "step": 11630 |
| }, |
| { |
| "epoch": 0.5687898556036063, |
| "grad_norm": 1.1770741939544678, |
| "learning_rate": 4.673587171488601e-06, |
| "loss": 0.4402, |
| "step": 11640 |
| }, |
| { |
| "epoch": 0.5692785066823035, |
| "grad_norm": 0.3521255552768707, |
| "learning_rate": 4.665077250132183e-06, |
| "loss": 0.4388, |
| "step": 11650 |
| }, |
| { |
| "epoch": 0.5697671577610007, |
| "grad_norm": 0.4423331618309021, |
| "learning_rate": 4.656568303225296e-06, |
| "loss": 0.4402, |
| "step": 11660 |
| }, |
| { |
| "epoch": 0.5702558088396981, |
| "grad_norm": 0.4402877390384674, |
| "learning_rate": 4.648060355524512e-06, |
| "loss": 0.4391, |
| "step": 11670 |
| }, |
| { |
| "epoch": 0.5707444599183953, |
| "grad_norm": 0.3995070457458496, |
| "learning_rate": 4.639553431783498e-06, |
| "loss": 0.4404, |
| "step": 11680 |
| }, |
| { |
| "epoch": 0.5712331109970925, |
| "grad_norm": 0.5264308452606201, |
| "learning_rate": 4.63104755675294e-06, |
| "loss": 0.4389, |
| "step": 11690 |
| }, |
| { |
| "epoch": 0.5717217620757897, |
| "grad_norm": 0.28230753540992737, |
| "learning_rate": 4.622542755180471e-06, |
| "loss": 0.4389, |
| "step": 11700 |
| }, |
| { |
| "epoch": 0.5722104131544871, |
| "grad_norm": 0.7925990223884583, |
| "learning_rate": 4.6140390518106034e-06, |
| "loss": 0.4395, |
| "step": 11710 |
| }, |
| { |
| "epoch": 0.5726990642331843, |
| "grad_norm": 1.0194525718688965, |
| "learning_rate": 4.605536471384656e-06, |
| "loss": 0.44, |
| "step": 11720 |
| }, |
| { |
| "epoch": 0.5731877153118815, |
| "grad_norm": 0.510903000831604, |
| "learning_rate": 4.597035038640676e-06, |
| "loss": 0.439, |
| "step": 11730 |
| }, |
| { |
| "epoch": 0.5736763663905788, |
| "grad_norm": 0.42407867312431335, |
| "learning_rate": 4.5885347783133725e-06, |
| "loss": 0.4401, |
| "step": 11740 |
| }, |
| { |
| "epoch": 0.5741650174692761, |
| "grad_norm": 0.5859852433204651, |
| "learning_rate": 4.580035715134047e-06, |
| "loss": 0.4381, |
| "step": 11750 |
| }, |
| { |
| "epoch": 0.5746536685479733, |
| "grad_norm": 0.5147973895072937, |
| "learning_rate": 4.571537873830515e-06, |
| "loss": 0.4399, |
| "step": 11760 |
| }, |
| { |
| "epoch": 0.5751423196266706, |
| "grad_norm": 0.6203701496124268, |
| "learning_rate": 4.563041279127038e-06, |
| "loss": 0.4389, |
| "step": 11770 |
| }, |
| { |
| "epoch": 0.5756309707053678, |
| "grad_norm": 0.4585236608982086, |
| "learning_rate": 4.554545955744247e-06, |
| "loss": 0.4383, |
| "step": 11780 |
| }, |
| { |
| "epoch": 0.5761196217840651, |
| "grad_norm": 0.41942375898361206, |
| "learning_rate": 4.546051928399081e-06, |
| "loss": 0.4386, |
| "step": 11790 |
| }, |
| { |
| "epoch": 0.5766082728627624, |
| "grad_norm": 0.5585193037986755, |
| "learning_rate": 4.537559221804703e-06, |
| "loss": 0.4389, |
| "step": 11800 |
| }, |
| { |
| "epoch": 0.5770969239414596, |
| "grad_norm": 0.4607734680175781, |
| "learning_rate": 4.529067860670433e-06, |
| "loss": 0.4388, |
| "step": 11810 |
| }, |
| { |
| "epoch": 0.5775855750201568, |
| "grad_norm": 0.6180665493011475, |
| "learning_rate": 4.520577869701679e-06, |
| "loss": 0.4382, |
| "step": 11820 |
| }, |
| { |
| "epoch": 0.5780742260988541, |
| "grad_norm": 0.7965272068977356, |
| "learning_rate": 4.5120892735998636e-06, |
| "loss": 0.4387, |
| "step": 11830 |
| }, |
| { |
| "epoch": 0.5785628771775514, |
| "grad_norm": 0.37461355328559875, |
| "learning_rate": 4.503602097062344e-06, |
| "loss": 0.4395, |
| "step": 11840 |
| }, |
| { |
| "epoch": 0.5790515282562486, |
| "grad_norm": 0.5917596817016602, |
| "learning_rate": 4.4951163647823595e-06, |
| "loss": 0.4385, |
| "step": 11850 |
| }, |
| { |
| "epoch": 0.5795401793349458, |
| "grad_norm": 0.47392183542251587, |
| "learning_rate": 4.486632101448935e-06, |
| "loss": 0.4372, |
| "step": 11860 |
| }, |
| { |
| "epoch": 0.5800288304136432, |
| "grad_norm": 0.43549230694770813, |
| "learning_rate": 4.478149331746829e-06, |
| "loss": 0.4387, |
| "step": 11870 |
| }, |
| { |
| "epoch": 0.5805174814923404, |
| "grad_norm": 0.5697550177574158, |
| "learning_rate": 4.469668080356451e-06, |
| "loss": 0.4387, |
| "step": 11880 |
| }, |
| { |
| "epoch": 0.5810061325710376, |
| "grad_norm": 0.3437957763671875, |
| "learning_rate": 4.461188371953795e-06, |
| "loss": 0.4388, |
| "step": 11890 |
| }, |
| { |
| "epoch": 0.581494783649735, |
| "grad_norm": 1.4066935777664185, |
| "learning_rate": 4.4527102312103624e-06, |
| "loss": 0.4402, |
| "step": 11900 |
| }, |
| { |
| "epoch": 0.5819834347284322, |
| "grad_norm": 0.5635364055633545, |
| "learning_rate": 4.4442336827930995e-06, |
| "loss": 0.4387, |
| "step": 11910 |
| }, |
| { |
| "epoch": 0.5824720858071294, |
| "grad_norm": 0.42688384652137756, |
| "learning_rate": 4.435758751364312e-06, |
| "loss": 0.4408, |
| "step": 11920 |
| }, |
| { |
| "epoch": 0.5829607368858267, |
| "grad_norm": 0.5010594725608826, |
| "learning_rate": 4.427285461581609e-06, |
| "loss": 0.4385, |
| "step": 11930 |
| }, |
| { |
| "epoch": 0.583449387964524, |
| "grad_norm": 0.6035897135734558, |
| "learning_rate": 4.418813838097815e-06, |
| "loss": 0.4402, |
| "step": 11940 |
| }, |
| { |
| "epoch": 0.5839380390432212, |
| "grad_norm": 0.7641412019729614, |
| "learning_rate": 4.410343905560916e-06, |
| "loss": 0.4391, |
| "step": 11950 |
| }, |
| { |
| "epoch": 0.5844266901219184, |
| "grad_norm": 0.4700312614440918, |
| "learning_rate": 4.401875688613971e-06, |
| "loss": 0.4379, |
| "step": 11960 |
| }, |
| { |
| "epoch": 0.5849153412006157, |
| "grad_norm": 0.9198450446128845, |
| "learning_rate": 4.3934092118950485e-06, |
| "loss": 0.4374, |
| "step": 11970 |
| }, |
| { |
| "epoch": 0.585403992279313, |
| "grad_norm": 0.896514356136322, |
| "learning_rate": 4.384944500037156e-06, |
| "loss": 0.4384, |
| "step": 11980 |
| }, |
| { |
| "epoch": 0.5858926433580102, |
| "grad_norm": 0.49591732025146484, |
| "learning_rate": 4.376481577668167e-06, |
| "loss": 0.44, |
| "step": 11990 |
| }, |
| { |
| "epoch": 0.5863812944367075, |
| "grad_norm": 0.5625073909759521, |
| "learning_rate": 4.368020469410742e-06, |
| "loss": 0.4389, |
| "step": 12000 |
| }, |
| { |
| "epoch": 0.5863812944367075, |
| "eval_loss": 0.41703999042510986, |
| "eval_runtime": 727.8065, |
| "eval_samples_per_second": 243.07, |
| "eval_steps_per_second": 0.475, |
| "step": 12000 |
| }, |
| { |
| "epoch": 0.5868699455154047, |
| "grad_norm": 0.6674771904945374, |
| "learning_rate": 4.359561199882272e-06, |
| "loss": 0.4393, |
| "step": 12010 |
| }, |
| { |
| "epoch": 0.587358596594102, |
| "grad_norm": 0.5143821239471436, |
| "learning_rate": 4.351103793694794e-06, |
| "loss": 0.4375, |
| "step": 12020 |
| }, |
| { |
| "epoch": 0.5878472476727993, |
| "grad_norm": 0.4788214862346649, |
| "learning_rate": 4.342648275454922e-06, |
| "loss": 0.4386, |
| "step": 12030 |
| }, |
| { |
| "epoch": 0.5883358987514965, |
| "grad_norm": 0.5421459078788757, |
| "learning_rate": 4.334194669763781e-06, |
| "loss": 0.4386, |
| "step": 12040 |
| }, |
| { |
| "epoch": 0.5888245498301937, |
| "grad_norm": 0.6345226168632507, |
| "learning_rate": 4.325743001216926e-06, |
| "loss": 0.4388, |
| "step": 12050 |
| }, |
| { |
| "epoch": 0.589313200908891, |
| "grad_norm": 1.1048717498779297, |
| "learning_rate": 4.317293294404285e-06, |
| "loss": 0.44, |
| "step": 12060 |
| }, |
| { |
| "epoch": 0.5898018519875883, |
| "grad_norm": 0.5707539916038513, |
| "learning_rate": 4.308845573910071e-06, |
| "loss": 0.4379, |
| "step": 12070 |
| }, |
| { |
| "epoch": 0.5902905030662855, |
| "grad_norm": 0.7084303498268127, |
| "learning_rate": 4.300399864312718e-06, |
| "loss": 0.4388, |
| "step": 12080 |
| }, |
| { |
| "epoch": 0.5907791541449827, |
| "grad_norm": 0.5199768543243408, |
| "learning_rate": 4.291956190184811e-06, |
| "loss": 0.4385, |
| "step": 12090 |
| }, |
| { |
| "epoch": 0.59126780522368, |
| "grad_norm": 0.35853302478790283, |
| "learning_rate": 4.283514576093015e-06, |
| "loss": 0.44, |
| "step": 12100 |
| }, |
| { |
| "epoch": 0.5917564563023773, |
| "grad_norm": 0.6634894609451294, |
| "learning_rate": 4.275075046597997e-06, |
| "loss": 0.4386, |
| "step": 12110 |
| }, |
| { |
| "epoch": 0.5922451073810745, |
| "grad_norm": 0.3389874994754791, |
| "learning_rate": 4.266637626254363e-06, |
| "loss": 0.439, |
| "step": 12120 |
| }, |
| { |
| "epoch": 0.5927337584597718, |
| "grad_norm": 0.38937532901763916, |
| "learning_rate": 4.258202339610581e-06, |
| "loss": 0.4389, |
| "step": 12130 |
| }, |
| { |
| "epoch": 0.593222409538469, |
| "grad_norm": 0.47301584482192993, |
| "learning_rate": 4.2497692112089086e-06, |
| "loss": 0.4382, |
| "step": 12140 |
| }, |
| { |
| "epoch": 0.5937110606171663, |
| "grad_norm": 0.4262164533138275, |
| "learning_rate": 4.241338265585327e-06, |
| "loss": 0.4384, |
| "step": 12150 |
| }, |
| { |
| "epoch": 0.5941997116958636, |
| "grad_norm": 0.3946975767612457, |
| "learning_rate": 4.232909527269465e-06, |
| "loss": 0.4389, |
| "step": 12160 |
| }, |
| { |
| "epoch": 0.5946883627745608, |
| "grad_norm": 0.30611652135849, |
| "learning_rate": 4.2244830207845335e-06, |
| "loss": 0.4384, |
| "step": 12170 |
| }, |
| { |
| "epoch": 0.595177013853258, |
| "grad_norm": 0.5015509128570557, |
| "learning_rate": 4.2160587706472445e-06, |
| "loss": 0.4386, |
| "step": 12180 |
| }, |
| { |
| "epoch": 0.5956656649319554, |
| "grad_norm": 2.779911518096924, |
| "learning_rate": 4.207636801367746e-06, |
| "loss": 0.4388, |
| "step": 12190 |
| }, |
| { |
| "epoch": 0.5961543160106526, |
| "grad_norm": 0.940437912940979, |
| "learning_rate": 4.199217137449553e-06, |
| "loss": 0.4403, |
| "step": 12200 |
| }, |
| { |
| "epoch": 0.5966429670893498, |
| "grad_norm": 1.1815273761749268, |
| "learning_rate": 4.190799803389472e-06, |
| "loss": 0.4384, |
| "step": 12210 |
| }, |
| { |
| "epoch": 0.597131618168047, |
| "grad_norm": 0.4534102976322174, |
| "learning_rate": 4.182384823677527e-06, |
| "loss": 0.4385, |
| "step": 12220 |
| }, |
| { |
| "epoch": 0.5976202692467444, |
| "grad_norm": 0.694245457649231, |
| "learning_rate": 4.173972222796897e-06, |
| "loss": 0.4382, |
| "step": 12230 |
| }, |
| { |
| "epoch": 0.5981089203254416, |
| "grad_norm": 0.5328917503356934, |
| "learning_rate": 4.165562025223839e-06, |
| "loss": 0.4392, |
| "step": 12240 |
| }, |
| { |
| "epoch": 0.5985975714041388, |
| "grad_norm": 0.7805267572402954, |
| "learning_rate": 4.157154255427613e-06, |
| "loss": 0.4394, |
| "step": 12250 |
| }, |
| { |
| "epoch": 0.5990862224828362, |
| "grad_norm": 0.6959843635559082, |
| "learning_rate": 4.148748937870425e-06, |
| "loss": 0.4366, |
| "step": 12260 |
| }, |
| { |
| "epoch": 0.5995748735615334, |
| "grad_norm": 0.9793679714202881, |
| "learning_rate": 4.140346097007336e-06, |
| "loss": 0.4383, |
| "step": 12270 |
| }, |
| { |
| "epoch": 0.6000635246402306, |
| "grad_norm": 0.3973505198955536, |
| "learning_rate": 4.1319457572862066e-06, |
| "loss": 0.4394, |
| "step": 12280 |
| }, |
| { |
| "epoch": 0.6005521757189279, |
| "grad_norm": 0.5687869191169739, |
| "learning_rate": 4.123547943147621e-06, |
| "loss": 0.4388, |
| "step": 12290 |
| }, |
| { |
| "epoch": 0.6010408267976252, |
| "grad_norm": 0.4026346802711487, |
| "learning_rate": 4.115152679024811e-06, |
| "loss": 0.4391, |
| "step": 12300 |
| }, |
| { |
| "epoch": 0.6015294778763224, |
| "grad_norm": 0.513808012008667, |
| "learning_rate": 4.106759989343594e-06, |
| "loss": 0.4381, |
| "step": 12310 |
| }, |
| { |
| "epoch": 0.6020181289550197, |
| "grad_norm": 0.36706215143203735, |
| "learning_rate": 4.0983698985222935e-06, |
| "loss": 0.4384, |
| "step": 12320 |
| }, |
| { |
| "epoch": 0.6025067800337169, |
| "grad_norm": 0.5302925705909729, |
| "learning_rate": 4.089982430971673e-06, |
| "loss": 0.4387, |
| "step": 12330 |
| }, |
| { |
| "epoch": 0.6029954311124142, |
| "grad_norm": 0.6953673362731934, |
| "learning_rate": 4.081597611094864e-06, |
| "loss": 0.4385, |
| "step": 12340 |
| }, |
| { |
| "epoch": 0.6034840821911114, |
| "grad_norm": 0.46951332688331604, |
| "learning_rate": 4.073215463287296e-06, |
| "loss": 0.4385, |
| "step": 12350 |
| }, |
| { |
| "epoch": 0.6039727332698087, |
| "grad_norm": 0.30504921078681946, |
| "learning_rate": 4.064836011936618e-06, |
| "loss": 0.4378, |
| "step": 12360 |
| }, |
| { |
| "epoch": 0.6044613843485059, |
| "grad_norm": 0.34291717410087585, |
| "learning_rate": 4.056459281422644e-06, |
| "loss": 0.4367, |
| "step": 12370 |
| }, |
| { |
| "epoch": 0.6049500354272032, |
| "grad_norm": 0.3311258852481842, |
| "learning_rate": 4.0480852961172635e-06, |
| "loss": 0.4387, |
| "step": 12380 |
| }, |
| { |
| "epoch": 0.6054386865059005, |
| "grad_norm": 0.48355287313461304, |
| "learning_rate": 4.039714080384381e-06, |
| "loss": 0.4385, |
| "step": 12390 |
| }, |
| { |
| "epoch": 0.6059273375845977, |
| "grad_norm": 0.6378800868988037, |
| "learning_rate": 4.031345658579846e-06, |
| "loss": 0.438, |
| "step": 12400 |
| }, |
| { |
| "epoch": 0.6064159886632949, |
| "grad_norm": 0.3167429566383362, |
| "learning_rate": 4.022980055051372e-06, |
| "loss": 0.4395, |
| "step": 12410 |
| }, |
| { |
| "epoch": 0.6069046397419923, |
| "grad_norm": 1.2204922437667847, |
| "learning_rate": 4.014617294138482e-06, |
| "loss": 0.4394, |
| "step": 12420 |
| }, |
| { |
| "epoch": 0.6073932908206895, |
| "grad_norm": 0.775138258934021, |
| "learning_rate": 4.006257400172422e-06, |
| "loss": 0.4393, |
| "step": 12430 |
| }, |
| { |
| "epoch": 0.6078819418993867, |
| "grad_norm": 0.5826382637023926, |
| "learning_rate": 3.9979003974760985e-06, |
| "loss": 0.4379, |
| "step": 12440 |
| }, |
| { |
| "epoch": 0.608370592978084, |
| "grad_norm": 0.5798311233520508, |
| "learning_rate": 3.989546310364005e-06, |
| "loss": 0.4379, |
| "step": 12450 |
| }, |
| { |
| "epoch": 0.6088592440567813, |
| "grad_norm": 0.749792218208313, |
| "learning_rate": 3.981195163142154e-06, |
| "loss": 0.4379, |
| "step": 12460 |
| }, |
| { |
| "epoch": 0.6093478951354785, |
| "grad_norm": 0.514415979385376, |
| "learning_rate": 3.972846980108005e-06, |
| "loss": 0.4391, |
| "step": 12470 |
| }, |
| { |
| "epoch": 0.6098365462141758, |
| "grad_norm": 0.38157758116722107, |
| "learning_rate": 3.964501785550392e-06, |
| "loss": 0.4375, |
| "step": 12480 |
| }, |
| { |
| "epoch": 0.610325197292873, |
| "grad_norm": 0.915421724319458, |
| "learning_rate": 3.956159603749452e-06, |
| "loss": 0.437, |
| "step": 12490 |
| }, |
| { |
| "epoch": 0.6108138483715703, |
| "grad_norm": 0.5357415080070496, |
| "learning_rate": 3.947820458976559e-06, |
| "loss": 0.4388, |
| "step": 12500 |
| }, |
| { |
| "epoch": 0.6108138483715703, |
| "eval_loss": 0.41651272773742676, |
| "eval_runtime": 728.6136, |
| "eval_samples_per_second": 242.801, |
| "eval_steps_per_second": 0.475, |
| "step": 12500 |
| }, |
| { |
| "epoch": 0.6113024994502675, |
| "grad_norm": 0.8443652391433716, |
| "learning_rate": 3.939484375494252e-06, |
| "loss": 0.4405, |
| "step": 12510 |
| }, |
| { |
| "epoch": 0.6117911505289648, |
| "grad_norm": 0.604301929473877, |
| "learning_rate": 3.931151377556159e-06, |
| "loss": 0.4383, |
| "step": 12520 |
| }, |
| { |
| "epoch": 0.612279801607662, |
| "grad_norm": 0.36815837025642395, |
| "learning_rate": 3.922821489406935e-06, |
| "loss": 0.4386, |
| "step": 12530 |
| }, |
| { |
| "epoch": 0.6127684526863593, |
| "grad_norm": 0.6259467005729675, |
| "learning_rate": 3.914494735282185e-06, |
| "loss": 0.4392, |
| "step": 12540 |
| }, |
| { |
| "epoch": 0.6132571037650566, |
| "grad_norm": 0.6359371542930603, |
| "learning_rate": 3.9061711394083965e-06, |
| "loss": 0.4392, |
| "step": 12550 |
| }, |
| { |
| "epoch": 0.6137457548437538, |
| "grad_norm": 0.5041322112083435, |
| "learning_rate": 3.897850726002864e-06, |
| "loss": 0.4399, |
| "step": 12560 |
| }, |
| { |
| "epoch": 0.614234405922451, |
| "grad_norm": 0.5972697138786316, |
| "learning_rate": 3.889533519273633e-06, |
| "loss": 0.4391, |
| "step": 12570 |
| }, |
| { |
| "epoch": 0.6147230570011484, |
| "grad_norm": 0.7836823463439941, |
| "learning_rate": 3.881219543419407e-06, |
| "loss": 0.4387, |
| "step": 12580 |
| }, |
| { |
| "epoch": 0.6152117080798456, |
| "grad_norm": 0.44390979409217834, |
| "learning_rate": 3.8729088226294995e-06, |
| "loss": 0.4384, |
| "step": 12590 |
| }, |
| { |
| "epoch": 0.6157003591585428, |
| "grad_norm": 0.32042884826660156, |
| "learning_rate": 3.8646013810837445e-06, |
| "loss": 0.4379, |
| "step": 12600 |
| }, |
| { |
| "epoch": 0.6161890102372402, |
| "grad_norm": 0.5421732664108276, |
| "learning_rate": 3.856297242952442e-06, |
| "loss": 0.4384, |
| "step": 12610 |
| }, |
| { |
| "epoch": 0.6166776613159374, |
| "grad_norm": 0.5136971473693848, |
| "learning_rate": 3.847996432396279e-06, |
| "loss": 0.4371, |
| "step": 12620 |
| }, |
| { |
| "epoch": 0.6171663123946346, |
| "grad_norm": 0.46279609203338623, |
| "learning_rate": 3.839698973566258e-06, |
| "loss": 0.4378, |
| "step": 12630 |
| }, |
| { |
| "epoch": 0.6176549634733318, |
| "grad_norm": 0.7376791834831238, |
| "learning_rate": 3.831404890603634e-06, |
| "loss": 0.4381, |
| "step": 12640 |
| }, |
| { |
| "epoch": 0.6181436145520292, |
| "grad_norm": 0.5303279757499695, |
| "learning_rate": 3.823114207639838e-06, |
| "loss": 0.4386, |
| "step": 12650 |
| }, |
| { |
| "epoch": 0.6186322656307264, |
| "grad_norm": 0.7225260138511658, |
| "learning_rate": 3.814826948796404e-06, |
| "loss": 0.438, |
| "step": 12660 |
| }, |
| { |
| "epoch": 0.6191209167094236, |
| "grad_norm": 0.8428411483764648, |
| "learning_rate": 3.8065431381849178e-06, |
| "loss": 0.4385, |
| "step": 12670 |
| }, |
| { |
| "epoch": 0.6196095677881209, |
| "grad_norm": 0.40499812364578247, |
| "learning_rate": 3.7982627999069164e-06, |
| "loss": 0.4382, |
| "step": 12680 |
| }, |
| { |
| "epoch": 0.6200982188668182, |
| "grad_norm": 0.44530633091926575, |
| "learning_rate": 3.7899859580538436e-06, |
| "loss": 0.4386, |
| "step": 12690 |
| }, |
| { |
| "epoch": 0.6205868699455154, |
| "grad_norm": 0.4268031418323517, |
| "learning_rate": 3.7817126367069674e-06, |
| "loss": 0.4374, |
| "step": 12700 |
| }, |
| { |
| "epoch": 0.6210755210242127, |
| "grad_norm": 0.2745535373687744, |
| "learning_rate": 3.773442859937313e-06, |
| "loss": 0.4383, |
| "step": 12710 |
| }, |
| { |
| "epoch": 0.6215641721029099, |
| "grad_norm": 0.5120725035667419, |
| "learning_rate": 3.765176651805593e-06, |
| "loss": 0.4383, |
| "step": 12720 |
| }, |
| { |
| "epoch": 0.6220528231816072, |
| "grad_norm": 0.3301103413105011, |
| "learning_rate": 3.7569140363621393e-06, |
| "loss": 0.4384, |
| "step": 12730 |
| }, |
| { |
| "epoch": 0.6225414742603045, |
| "grad_norm": 0.34257206320762634, |
| "learning_rate": 3.7486550376468266e-06, |
| "loss": 0.4366, |
| "step": 12740 |
| }, |
| { |
| "epoch": 0.6230301253390017, |
| "grad_norm": 0.37387409806251526, |
| "learning_rate": 3.7403996796890096e-06, |
| "loss": 0.4381, |
| "step": 12750 |
| }, |
| { |
| "epoch": 0.6235187764176989, |
| "grad_norm": 0.5832339525222778, |
| "learning_rate": 3.732147986507453e-06, |
| "loss": 0.4389, |
| "step": 12760 |
| }, |
| { |
| "epoch": 0.6240074274963961, |
| "grad_norm": 0.39319491386413574, |
| "learning_rate": 3.723899982110249e-06, |
| "loss": 0.4379, |
| "step": 12770 |
| }, |
| { |
| "epoch": 0.6244960785750935, |
| "grad_norm": 1.1208192110061646, |
| "learning_rate": 3.7156556904947725e-06, |
| "loss": 0.4374, |
| "step": 12780 |
| }, |
| { |
| "epoch": 0.6249847296537907, |
| "grad_norm": 1.2163150310516357, |
| "learning_rate": 3.7074151356475828e-06, |
| "loss": 0.4386, |
| "step": 12790 |
| }, |
| { |
| "epoch": 0.6254733807324879, |
| "grad_norm": 0.5749249458312988, |
| "learning_rate": 3.6991783415443726e-06, |
| "loss": 0.4376, |
| "step": 12800 |
| }, |
| { |
| "epoch": 0.6259620318111853, |
| "grad_norm": 0.3662860095500946, |
| "learning_rate": 3.6909453321498954e-06, |
| "loss": 0.4387, |
| "step": 12810 |
| }, |
| { |
| "epoch": 0.6264506828898825, |
| "grad_norm": 0.7711629271507263, |
| "learning_rate": 3.682716131417887e-06, |
| "loss": 0.4387, |
| "step": 12820 |
| }, |
| { |
| "epoch": 0.6269393339685797, |
| "grad_norm": 0.4106141924858093, |
| "learning_rate": 3.6744907632910064e-06, |
| "loss": 0.4376, |
| "step": 12830 |
| }, |
| { |
| "epoch": 0.627427985047277, |
| "grad_norm": 0.8427706956863403, |
| "learning_rate": 3.6662692517007613e-06, |
| "loss": 0.4376, |
| "step": 12840 |
| }, |
| { |
| "epoch": 0.6279166361259743, |
| "grad_norm": 0.4671982526779175, |
| "learning_rate": 3.6580516205674367e-06, |
| "loss": 0.4375, |
| "step": 12850 |
| }, |
| { |
| "epoch": 0.6284052872046715, |
| "grad_norm": 0.643839955329895, |
| "learning_rate": 3.64983789380003e-06, |
| "loss": 0.4382, |
| "step": 12860 |
| }, |
| { |
| "epoch": 0.6288939382833688, |
| "grad_norm": 0.3143644630908966, |
| "learning_rate": 3.6416280952961756e-06, |
| "loss": 0.4378, |
| "step": 12870 |
| }, |
| { |
| "epoch": 0.629382589362066, |
| "grad_norm": 0.5174784064292908, |
| "learning_rate": 3.6334222489420845e-06, |
| "loss": 0.4386, |
| "step": 12880 |
| }, |
| { |
| "epoch": 0.6298712404407633, |
| "grad_norm": 0.35816308856010437, |
| "learning_rate": 3.625220378612465e-06, |
| "loss": 0.4381, |
| "step": 12890 |
| }, |
| { |
| "epoch": 0.6303598915194605, |
| "grad_norm": 0.4106110632419586, |
| "learning_rate": 3.617022508170456e-06, |
| "loss": 0.4372, |
| "step": 12900 |
| }, |
| { |
| "epoch": 0.6308485425981578, |
| "grad_norm": 1.5037391185760498, |
| "learning_rate": 3.608828661467561e-06, |
| "loss": 0.4366, |
| "step": 12910 |
| }, |
| { |
| "epoch": 0.631337193676855, |
| "grad_norm": 0.6403370499610901, |
| "learning_rate": 3.6006388623435778e-06, |
| "loss": 0.4373, |
| "step": 12920 |
| }, |
| { |
| "epoch": 0.6318258447555523, |
| "grad_norm": 0.4930186867713928, |
| "learning_rate": 3.5924531346265235e-06, |
| "loss": 0.4379, |
| "step": 12930 |
| }, |
| { |
| "epoch": 0.6323144958342496, |
| "grad_norm": 0.3067891001701355, |
| "learning_rate": 3.5842715021325745e-06, |
| "loss": 0.4368, |
| "step": 12940 |
| }, |
| { |
| "epoch": 0.6328031469129468, |
| "grad_norm": 0.7694682478904724, |
| "learning_rate": 3.5760939886659896e-06, |
| "loss": 0.438, |
| "step": 12950 |
| }, |
| { |
| "epoch": 0.633291797991644, |
| "grad_norm": 0.5115815997123718, |
| "learning_rate": 3.567920618019043e-06, |
| "loss": 0.4377, |
| "step": 12960 |
| }, |
| { |
| "epoch": 0.6337804490703414, |
| "grad_norm": 0.6964974999427795, |
| "learning_rate": 3.559751413971955e-06, |
| "loss": 0.4375, |
| "step": 12970 |
| }, |
| { |
| "epoch": 0.6342691001490386, |
| "grad_norm": 0.5830658078193665, |
| "learning_rate": 3.551586400292828e-06, |
| "loss": 0.4381, |
| "step": 12980 |
| }, |
| { |
| "epoch": 0.6347577512277358, |
| "grad_norm": 0.8513720035552979, |
| "learning_rate": 3.5434256007375666e-06, |
| "loss": 0.4376, |
| "step": 12990 |
| }, |
| { |
| "epoch": 0.6352464023064331, |
| "grad_norm": 0.4420766234397888, |
| "learning_rate": 3.535269039049819e-06, |
| "loss": 0.436, |
| "step": 13000 |
| }, |
| { |
| "epoch": 0.6352464023064331, |
| "eval_loss": 0.41433966159820557, |
| "eval_runtime": 729.8346, |
| "eval_samples_per_second": 242.395, |
| "eval_steps_per_second": 0.474, |
| "step": 13000 |
| }, |
| { |
| "epoch": 0.6357350533851304, |
| "grad_norm": 1.997313380241394, |
| "learning_rate": 3.5271167389608996e-06, |
| "loss": 0.4376, |
| "step": 13010 |
| }, |
| { |
| "epoch": 0.6362237044638276, |
| "grad_norm": 0.5262558460235596, |
| "learning_rate": 3.518968724189727e-06, |
| "loss": 0.439, |
| "step": 13020 |
| }, |
| { |
| "epoch": 0.6367123555425248, |
| "grad_norm": 0.7942774295806885, |
| "learning_rate": 3.5108250184427507e-06, |
| "loss": 0.4368, |
| "step": 13030 |
| }, |
| { |
| "epoch": 0.6372010066212221, |
| "grad_norm": 0.44571954011917114, |
| "learning_rate": 3.50268564541388e-06, |
| "loss": 0.4386, |
| "step": 13040 |
| }, |
| { |
| "epoch": 0.6376896576999194, |
| "grad_norm": 0.3043385148048401, |
| "learning_rate": 3.4945506287844245e-06, |
| "loss": 0.4377, |
| "step": 13050 |
| }, |
| { |
| "epoch": 0.6381783087786166, |
| "grad_norm": 0.41446077823638916, |
| "learning_rate": 3.4864199922230156e-06, |
| "loss": 0.4376, |
| "step": 13060 |
| }, |
| { |
| "epoch": 0.6386669598573139, |
| "grad_norm": 0.361817866563797, |
| "learning_rate": 3.4782937593855386e-06, |
| "loss": 0.4368, |
| "step": 13070 |
| }, |
| { |
| "epoch": 0.6391556109360111, |
| "grad_norm": 0.2429763674736023, |
| "learning_rate": 3.4701719539150692e-06, |
| "loss": 0.4384, |
| "step": 13080 |
| }, |
| { |
| "epoch": 0.6396442620147084, |
| "grad_norm": 0.6479557156562805, |
| "learning_rate": 3.4620545994418044e-06, |
| "loss": 0.4369, |
| "step": 13090 |
| }, |
| { |
| "epoch": 0.6401329130934057, |
| "grad_norm": 0.2731977105140686, |
| "learning_rate": 3.453941719582985e-06, |
| "loss": 0.4367, |
| "step": 13100 |
| }, |
| { |
| "epoch": 0.6406215641721029, |
| "grad_norm": 0.3237663209438324, |
| "learning_rate": 3.445833337942838e-06, |
| "loss": 0.4389, |
| "step": 13110 |
| }, |
| { |
| "epoch": 0.6411102152508001, |
| "grad_norm": 0.6228281855583191, |
| "learning_rate": 3.4377294781124997e-06, |
| "loss": 0.4361, |
| "step": 13120 |
| }, |
| { |
| "epoch": 0.6415988663294975, |
| "grad_norm": 0.7347028255462646, |
| "learning_rate": 3.4296301636699527e-06, |
| "loss": 0.4378, |
| "step": 13130 |
| }, |
| { |
| "epoch": 0.6420875174081947, |
| "grad_norm": 0.8885689377784729, |
| "learning_rate": 3.421535418179953e-06, |
| "loss": 0.4379, |
| "step": 13140 |
| }, |
| { |
| "epoch": 0.6425761684868919, |
| "grad_norm": 0.7151497602462769, |
| "learning_rate": 3.413445265193964e-06, |
| "loss": 0.4373, |
| "step": 13150 |
| }, |
| { |
| "epoch": 0.6430648195655891, |
| "grad_norm": 0.46183907985687256, |
| "learning_rate": 3.4053597282500882e-06, |
| "loss": 0.4373, |
| "step": 13160 |
| }, |
| { |
| "epoch": 0.6435534706442865, |
| "grad_norm": 0.7960475087165833, |
| "learning_rate": 3.397278830872998e-06, |
| "loss": 0.4358, |
| "step": 13170 |
| }, |
| { |
| "epoch": 0.6440421217229837, |
| "grad_norm": 0.5535709261894226, |
| "learning_rate": 3.3892025965738616e-06, |
| "loss": 0.4373, |
| "step": 13180 |
| }, |
| { |
| "epoch": 0.6445307728016809, |
| "grad_norm": 0.8837286829948425, |
| "learning_rate": 3.3811310488502924e-06, |
| "loss": 0.4372, |
| "step": 13190 |
| }, |
| { |
| "epoch": 0.6450194238803783, |
| "grad_norm": 0.5701731443405151, |
| "learning_rate": 3.3730642111862543e-06, |
| "loss": 0.4381, |
| "step": 13200 |
| }, |
| { |
| "epoch": 0.6455080749590755, |
| "grad_norm": 0.449485182762146, |
| "learning_rate": 3.365002107052017e-06, |
| "loss": 0.4381, |
| "step": 13210 |
| }, |
| { |
| "epoch": 0.6459967260377727, |
| "grad_norm": 0.43208661675453186, |
| "learning_rate": 3.356944759904075e-06, |
| "loss": 0.4387, |
| "step": 13220 |
| }, |
| { |
| "epoch": 0.64648537711647, |
| "grad_norm": 0.5452390313148499, |
| "learning_rate": 3.3488921931850794e-06, |
| "loss": 0.4374, |
| "step": 13230 |
| }, |
| { |
| "epoch": 0.6469740281951673, |
| "grad_norm": 0.37224072217941284, |
| "learning_rate": 3.3408444303237786e-06, |
| "loss": 0.4376, |
| "step": 13240 |
| }, |
| { |
| "epoch": 0.6474626792738645, |
| "grad_norm": 0.6517390608787537, |
| "learning_rate": 3.3328014947349406e-06, |
| "loss": 0.4377, |
| "step": 13250 |
| }, |
| { |
| "epoch": 0.6479513303525618, |
| "grad_norm": 0.3955247402191162, |
| "learning_rate": 3.3247634098192884e-06, |
| "loss": 0.4388, |
| "step": 13260 |
| }, |
| { |
| "epoch": 0.648439981431259, |
| "grad_norm": 0.3447047770023346, |
| "learning_rate": 3.316730198963433e-06, |
| "loss": 0.4377, |
| "step": 13270 |
| }, |
| { |
| "epoch": 0.6489286325099562, |
| "grad_norm": 0.8046542406082153, |
| "learning_rate": 3.3087018855398045e-06, |
| "loss": 0.4374, |
| "step": 13280 |
| }, |
| { |
| "epoch": 0.6494172835886535, |
| "grad_norm": 0.5053970217704773, |
| "learning_rate": 3.300678492906586e-06, |
| "loss": 0.4377, |
| "step": 13290 |
| }, |
| { |
| "epoch": 0.6499059346673508, |
| "grad_norm": 1.129328727722168, |
| "learning_rate": 3.292660044407642e-06, |
| "loss": 0.4373, |
| "step": 13300 |
| }, |
| { |
| "epoch": 0.650394585746048, |
| "grad_norm": 1.0235140323638916, |
| "learning_rate": 3.2846465633724487e-06, |
| "loss": 0.438, |
| "step": 13310 |
| }, |
| { |
| "epoch": 0.6508832368247452, |
| "grad_norm": 1.2467355728149414, |
| "learning_rate": 3.2766380731160342e-06, |
| "loss": 0.4376, |
| "step": 13320 |
| }, |
| { |
| "epoch": 0.6513718879034426, |
| "grad_norm": 0.42103078961372375, |
| "learning_rate": 3.268634596938906e-06, |
| "loss": 0.4369, |
| "step": 13330 |
| }, |
| { |
| "epoch": 0.6518605389821398, |
| "grad_norm": 0.6862124800682068, |
| "learning_rate": 3.26063615812698e-06, |
| "loss": 0.4384, |
| "step": 13340 |
| }, |
| { |
| "epoch": 0.652349190060837, |
| "grad_norm": 0.4259004294872284, |
| "learning_rate": 3.252642779951518e-06, |
| "loss": 0.4385, |
| "step": 13350 |
| }, |
| { |
| "epoch": 0.6528378411395344, |
| "grad_norm": 0.3901737630367279, |
| "learning_rate": 3.2446544856690595e-06, |
| "loss": 0.4375, |
| "step": 13360 |
| }, |
| { |
| "epoch": 0.6533264922182316, |
| "grad_norm": 0.5543071627616882, |
| "learning_rate": 3.236671298521349e-06, |
| "loss": 0.4373, |
| "step": 13370 |
| }, |
| { |
| "epoch": 0.6538151432969288, |
| "grad_norm": 0.4534682333469391, |
| "learning_rate": 3.228693241735274e-06, |
| "loss": 0.4379, |
| "step": 13380 |
| }, |
| { |
| "epoch": 0.6543037943756261, |
| "grad_norm": 0.25920426845550537, |
| "learning_rate": 3.220720338522795e-06, |
| "loss": 0.4371, |
| "step": 13390 |
| }, |
| { |
| "epoch": 0.6547924454543234, |
| "grad_norm": 0.3495095372200012, |
| "learning_rate": 3.2127526120808807e-06, |
| "loss": 0.4386, |
| "step": 13400 |
| }, |
| { |
| "epoch": 0.6552810965330206, |
| "grad_norm": 0.5815818309783936, |
| "learning_rate": 3.204790085591435e-06, |
| "loss": 0.4386, |
| "step": 13410 |
| }, |
| { |
| "epoch": 0.6557697476117178, |
| "grad_norm": 0.48536261916160583, |
| "learning_rate": 3.1968327822212325e-06, |
| "loss": 0.4376, |
| "step": 13420 |
| }, |
| { |
| "epoch": 0.6562583986904151, |
| "grad_norm": 0.46819832921028137, |
| "learning_rate": 3.1888807251218524e-06, |
| "loss": 0.4364, |
| "step": 13430 |
| }, |
| { |
| "epoch": 0.6567470497691124, |
| "grad_norm": 0.6933978199958801, |
| "learning_rate": 3.180933937429612e-06, |
| "loss": 0.4366, |
| "step": 13440 |
| }, |
| { |
| "epoch": 0.6572357008478096, |
| "grad_norm": 0.6232645511627197, |
| "learning_rate": 3.1729924422654917e-06, |
| "loss": 0.4372, |
| "step": 13450 |
| }, |
| { |
| "epoch": 0.6577243519265069, |
| "grad_norm": 0.6195946335792542, |
| "learning_rate": 3.1650562627350797e-06, |
| "loss": 0.4379, |
| "step": 13460 |
| }, |
| { |
| "epoch": 0.6582130030052041, |
| "grad_norm": 0.47936639189720154, |
| "learning_rate": 3.157125421928496e-06, |
| "loss": 0.4375, |
| "step": 13470 |
| }, |
| { |
| "epoch": 0.6587016540839014, |
| "grad_norm": 0.7483202219009399, |
| "learning_rate": 3.1491999429203253e-06, |
| "loss": 0.4375, |
| "step": 13480 |
| }, |
| { |
| "epoch": 0.6591903051625987, |
| "grad_norm": 0.6134311556816101, |
| "learning_rate": 3.141279848769555e-06, |
| "loss": 0.4373, |
| "step": 13490 |
| }, |
| { |
| "epoch": 0.6596789562412959, |
| "grad_norm": 0.623481810092926, |
| "learning_rate": 3.1333651625195065e-06, |
| "loss": 0.4377, |
| "step": 13500 |
| }, |
| { |
| "epoch": 0.6596789562412959, |
| "eval_loss": 0.41565391421318054, |
| "eval_runtime": 728.9931, |
| "eval_samples_per_second": 242.674, |
| "eval_steps_per_second": 0.475, |
| "step": 13500 |
| }, |
| { |
| "epoch": 0.6601676073199931, |
| "grad_norm": 0.6795092225074768, |
| "learning_rate": 3.125455907197765e-06, |
| "loss": 0.4376, |
| "step": 13510 |
| }, |
| { |
| "epoch": 0.6606562583986905, |
| "grad_norm": 0.766830325126648, |
| "learning_rate": 3.117552105816116e-06, |
| "loss": 0.4361, |
| "step": 13520 |
| }, |
| { |
| "epoch": 0.6611449094773877, |
| "grad_norm": 1.3391401767730713, |
| "learning_rate": 3.109653781370473e-06, |
| "loss": 0.4367, |
| "step": 13530 |
| }, |
| { |
| "epoch": 0.6616335605560849, |
| "grad_norm": 0.4134541451931, |
| "learning_rate": 3.101760956840819e-06, |
| "loss": 0.4379, |
| "step": 13540 |
| }, |
| { |
| "epoch": 0.6621222116347821, |
| "grad_norm": 0.8742764592170715, |
| "learning_rate": 3.093873655191135e-06, |
| "loss": 0.4365, |
| "step": 13550 |
| }, |
| { |
| "epoch": 0.6626108627134795, |
| "grad_norm": 0.5794457197189331, |
| "learning_rate": 3.0859918993693294e-06, |
| "loss": 0.4375, |
| "step": 13560 |
| }, |
| { |
| "epoch": 0.6630995137921767, |
| "grad_norm": 0.5920469760894775, |
| "learning_rate": 3.0781157123071782e-06, |
| "loss": 0.4366, |
| "step": 13570 |
| }, |
| { |
| "epoch": 0.6635881648708739, |
| "grad_norm": 0.8090350031852722, |
| "learning_rate": 3.070245116920255e-06, |
| "loss": 0.4375, |
| "step": 13580 |
| }, |
| { |
| "epoch": 0.6640768159495712, |
| "grad_norm": 0.6335200071334839, |
| "learning_rate": 3.062380136107863e-06, |
| "loss": 0.4372, |
| "step": 13590 |
| }, |
| { |
| "epoch": 0.6645654670282685, |
| "grad_norm": 0.5986902117729187, |
| "learning_rate": 3.054520792752973e-06, |
| "loss": 0.4369, |
| "step": 13600 |
| }, |
| { |
| "epoch": 0.6650541181069657, |
| "grad_norm": 0.5582621693611145, |
| "learning_rate": 3.0466671097221506e-06, |
| "loss": 0.4373, |
| "step": 13610 |
| }, |
| { |
| "epoch": 0.665542769185663, |
| "grad_norm": 0.40097367763519287, |
| "learning_rate": 3.038819109865495e-06, |
| "loss": 0.437, |
| "step": 13620 |
| }, |
| { |
| "epoch": 0.6660314202643602, |
| "grad_norm": 0.3417106866836548, |
| "learning_rate": 3.0309768160165697e-06, |
| "loss": 0.4367, |
| "step": 13630 |
| }, |
| { |
| "epoch": 0.6665200713430575, |
| "grad_norm": 0.8929291367530823, |
| "learning_rate": 3.0231402509923347e-06, |
| "loss": 0.4371, |
| "step": 13640 |
| }, |
| { |
| "epoch": 0.6670087224217548, |
| "grad_norm": 0.5947937965393066, |
| "learning_rate": 3.015309437593084e-06, |
| "loss": 0.4381, |
| "step": 13650 |
| }, |
| { |
| "epoch": 0.667497373500452, |
| "grad_norm": 0.42644399404525757, |
| "learning_rate": 3.00748439860238e-06, |
| "loss": 0.4368, |
| "step": 13660 |
| }, |
| { |
| "epoch": 0.6679860245791492, |
| "grad_norm": 0.378689706325531, |
| "learning_rate": 2.9996651567869784e-06, |
| "loss": 0.4358, |
| "step": 13670 |
| }, |
| { |
| "epoch": 0.6684746756578465, |
| "grad_norm": 0.5530552268028259, |
| "learning_rate": 2.9918517348967734e-06, |
| "loss": 0.4376, |
| "step": 13680 |
| }, |
| { |
| "epoch": 0.6689633267365438, |
| "grad_norm": 0.5646296739578247, |
| "learning_rate": 2.9840441556647247e-06, |
| "loss": 0.4371, |
| "step": 13690 |
| }, |
| { |
| "epoch": 0.669451977815241, |
| "grad_norm": 0.7569136619567871, |
| "learning_rate": 2.9762424418067905e-06, |
| "loss": 0.4373, |
| "step": 13700 |
| }, |
| { |
| "epoch": 0.6699406288939382, |
| "grad_norm": 1.0428658723831177, |
| "learning_rate": 2.968446616021868e-06, |
| "loss": 0.4379, |
| "step": 13710 |
| }, |
| { |
| "epoch": 0.6704292799726356, |
| "grad_norm": 0.7594118714332581, |
| "learning_rate": 2.9606567009917218e-06, |
| "loss": 0.4375, |
| "step": 13720 |
| }, |
| { |
| "epoch": 0.6709179310513328, |
| "grad_norm": 2.9701902866363525, |
| "learning_rate": 2.952872719380917e-06, |
| "loss": 0.4379, |
| "step": 13730 |
| }, |
| { |
| "epoch": 0.67140658213003, |
| "grad_norm": 0.9236831665039062, |
| "learning_rate": 2.94509469383676e-06, |
| "loss": 0.4371, |
| "step": 13740 |
| }, |
| { |
| "epoch": 0.6718952332087273, |
| "grad_norm": 1.1951662302017212, |
| "learning_rate": 2.9373226469892223e-06, |
| "loss": 0.4377, |
| "step": 13750 |
| }, |
| { |
| "epoch": 0.6723838842874246, |
| "grad_norm": 0.3135634958744049, |
| "learning_rate": 2.9295566014508853e-06, |
| "loss": 0.4369, |
| "step": 13760 |
| }, |
| { |
| "epoch": 0.6728725353661218, |
| "grad_norm": 0.4447099566459656, |
| "learning_rate": 2.9217965798168685e-06, |
| "loss": 0.4376, |
| "step": 13770 |
| }, |
| { |
| "epoch": 0.6733611864448191, |
| "grad_norm": 0.3684927821159363, |
| "learning_rate": 2.914042604664764e-06, |
| "loss": 0.4373, |
| "step": 13780 |
| }, |
| { |
| "epoch": 0.6738498375235163, |
| "grad_norm": 0.3362119495868683, |
| "learning_rate": 2.9062946985545707e-06, |
| "loss": 0.4371, |
| "step": 13790 |
| }, |
| { |
| "epoch": 0.6743384886022136, |
| "grad_norm": 0.6925981640815735, |
| "learning_rate": 2.898552884028634e-06, |
| "loss": 0.4371, |
| "step": 13800 |
| }, |
| { |
| "epoch": 0.6748271396809108, |
| "grad_norm": 0.49009522795677185, |
| "learning_rate": 2.8908171836115736e-06, |
| "loss": 0.4382, |
| "step": 13810 |
| }, |
| { |
| "epoch": 0.6753157907596081, |
| "grad_norm": 0.49105721712112427, |
| "learning_rate": 2.8830876198102176e-06, |
| "loss": 0.4369, |
| "step": 13820 |
| }, |
| { |
| "epoch": 0.6758044418383053, |
| "grad_norm": 0.5330390930175781, |
| "learning_rate": 2.875364215113547e-06, |
| "loss": 0.4365, |
| "step": 13830 |
| }, |
| { |
| "epoch": 0.6762930929170026, |
| "grad_norm": 0.43516021966934204, |
| "learning_rate": 2.8676469919926152e-06, |
| "loss": 0.437, |
| "step": 13840 |
| }, |
| { |
| "epoch": 0.6767817439956999, |
| "grad_norm": 0.4716795086860657, |
| "learning_rate": 2.859935972900492e-06, |
| "loss": 0.4361, |
| "step": 13850 |
| }, |
| { |
| "epoch": 0.6772703950743971, |
| "grad_norm": 0.38898736238479614, |
| "learning_rate": 2.8522311802722038e-06, |
| "loss": 0.4369, |
| "step": 13860 |
| }, |
| { |
| "epoch": 0.6777590461530943, |
| "grad_norm": 0.34487384557724, |
| "learning_rate": 2.8445326365246516e-06, |
| "loss": 0.4381, |
| "step": 13870 |
| }, |
| { |
| "epoch": 0.6782476972317917, |
| "grad_norm": 0.30314865708351135, |
| "learning_rate": 2.836840364056559e-06, |
| "loss": 0.4371, |
| "step": 13880 |
| }, |
| { |
| "epoch": 0.6787363483104889, |
| "grad_norm": 0.5969054102897644, |
| "learning_rate": 2.829154385248409e-06, |
| "loss": 0.4367, |
| "step": 13890 |
| }, |
| { |
| "epoch": 0.6792249993891861, |
| "grad_norm": 0.32903793454170227, |
| "learning_rate": 2.8214747224623627e-06, |
| "loss": 0.4357, |
| "step": 13900 |
| }, |
| { |
| "epoch": 0.6797136504678835, |
| "grad_norm": 0.3053576648235321, |
| "learning_rate": 2.8138013980422164e-06, |
| "loss": 0.4365, |
| "step": 13910 |
| }, |
| { |
| "epoch": 0.6802023015465807, |
| "grad_norm": 0.43716076016426086, |
| "learning_rate": 2.8061344343133144e-06, |
| "loss": 0.4364, |
| "step": 13920 |
| }, |
| { |
| "epoch": 0.6806909526252779, |
| "grad_norm": 0.4108024537563324, |
| "learning_rate": 2.7984738535825044e-06, |
| "loss": 0.4379, |
| "step": 13930 |
| }, |
| { |
| "epoch": 0.6811796037039751, |
| "grad_norm": 0.45622798800468445, |
| "learning_rate": 2.790819678138056e-06, |
| "loss": 0.4368, |
| "step": 13940 |
| }, |
| { |
| "epoch": 0.6816682547826725, |
| "grad_norm": 0.416840523481369, |
| "learning_rate": 2.783171930249603e-06, |
| "loss": 0.4374, |
| "step": 13950 |
| }, |
| { |
| "epoch": 0.6821569058613697, |
| "grad_norm": 0.5477868914604187, |
| "learning_rate": 2.775530632168084e-06, |
| "loss": 0.4372, |
| "step": 13960 |
| }, |
| { |
| "epoch": 0.6826455569400669, |
| "grad_norm": 0.3617335259914398, |
| "learning_rate": 2.7678958061256667e-06, |
| "loss": 0.4363, |
| "step": 13970 |
| }, |
| { |
| "epoch": 0.6831342080187642, |
| "grad_norm": 0.45384445786476135, |
| "learning_rate": 2.7602674743356893e-06, |
| "loss": 0.4349, |
| "step": 13980 |
| }, |
| { |
| "epoch": 0.6836228590974615, |
| "grad_norm": 0.3232516944408417, |
| "learning_rate": 2.752645658992599e-06, |
| "loss": 0.4369, |
| "step": 13990 |
| }, |
| { |
| "epoch": 0.6841115101761587, |
| "grad_norm": 0.4313335418701172, |
| "learning_rate": 2.745030382271879e-06, |
| "loss": 0.4378, |
| "step": 14000 |
| }, |
| { |
| "epoch": 0.6841115101761587, |
| "eval_loss": 0.41417059302330017, |
| "eval_runtime": 727.8695, |
| "eval_samples_per_second": 243.049, |
| "eval_steps_per_second": 0.475, |
| "step": 14000 |
| }, |
| { |
| "epoch": 0.684600161254856, |
| "grad_norm": 0.5871222615242004, |
| "learning_rate": 2.737421666329987e-06, |
| "loss": 0.4373, |
| "step": 14010 |
| }, |
| { |
| "epoch": 0.6850888123335532, |
| "grad_norm": 0.4553307294845581, |
| "learning_rate": 2.7298195333043022e-06, |
| "loss": 0.4372, |
| "step": 14020 |
| }, |
| { |
| "epoch": 0.6855774634122505, |
| "grad_norm": 0.49893367290496826, |
| "learning_rate": 2.722224005313041e-06, |
| "loss": 0.4366, |
| "step": 14030 |
| }, |
| { |
| "epoch": 0.6860661144909478, |
| "grad_norm": 0.401821494102478, |
| "learning_rate": 2.7146351044552045e-06, |
| "loss": 0.4372, |
| "step": 14040 |
| }, |
| { |
| "epoch": 0.686554765569645, |
| "grad_norm": 0.2880902588367462, |
| "learning_rate": 2.7070528528105165e-06, |
| "loss": 0.4366, |
| "step": 14050 |
| }, |
| { |
| "epoch": 0.6870434166483422, |
| "grad_norm": 0.4244653880596161, |
| "learning_rate": 2.6994772724393516e-06, |
| "loss": 0.4368, |
| "step": 14060 |
| }, |
| { |
| "epoch": 0.6875320677270395, |
| "grad_norm": 0.4931180775165558, |
| "learning_rate": 2.6919083853826724e-06, |
| "loss": 0.4371, |
| "step": 14070 |
| }, |
| { |
| "epoch": 0.6880207188057368, |
| "grad_norm": 0.5409946441650391, |
| "learning_rate": 2.684346213661974e-06, |
| "loss": 0.4363, |
| "step": 14080 |
| }, |
| { |
| "epoch": 0.688509369884434, |
| "grad_norm": 0.4695432484149933, |
| "learning_rate": 2.676790779279209e-06, |
| "loss": 0.4369, |
| "step": 14090 |
| }, |
| { |
| "epoch": 0.6889980209631312, |
| "grad_norm": 4.034379005432129, |
| "learning_rate": 2.669242104216725e-06, |
| "loss": 0.4363, |
| "step": 14100 |
| }, |
| { |
| "epoch": 0.6894866720418286, |
| "grad_norm": 0.6742619872093201, |
| "learning_rate": 2.6617002104372096e-06, |
| "loss": 0.4373, |
| "step": 14110 |
| }, |
| { |
| "epoch": 0.6899753231205258, |
| "grad_norm": 0.6923062801361084, |
| "learning_rate": 2.6541651198836207e-06, |
| "loss": 0.4365, |
| "step": 14120 |
| }, |
| { |
| "epoch": 0.690463974199223, |
| "grad_norm": 0.6054366230964661, |
| "learning_rate": 2.6466368544791164e-06, |
| "loss": 0.4364, |
| "step": 14130 |
| }, |
| { |
| "epoch": 0.6909526252779203, |
| "grad_norm": 0.809479296207428, |
| "learning_rate": 2.639115436126999e-06, |
| "loss": 0.4358, |
| "step": 14140 |
| }, |
| { |
| "epoch": 0.6914412763566176, |
| "grad_norm": 0.458893358707428, |
| "learning_rate": 2.6316008867106547e-06, |
| "loss": 0.4365, |
| "step": 14150 |
| }, |
| { |
| "epoch": 0.6919299274353148, |
| "grad_norm": 1.5249381065368652, |
| "learning_rate": 2.6240932280934794e-06, |
| "loss": 0.4353, |
| "step": 14160 |
| }, |
| { |
| "epoch": 0.6924185785140121, |
| "grad_norm": 0.435376912355423, |
| "learning_rate": 2.616592482118818e-06, |
| "loss": 0.4358, |
| "step": 14170 |
| }, |
| { |
| "epoch": 0.6929072295927093, |
| "grad_norm": 0.5011893510818481, |
| "learning_rate": 2.6090986706099135e-06, |
| "loss": 0.4361, |
| "step": 14180 |
| }, |
| { |
| "epoch": 0.6933958806714066, |
| "grad_norm": 0.42486095428466797, |
| "learning_rate": 2.6016118153698235e-06, |
| "loss": 0.4374, |
| "step": 14190 |
| }, |
| { |
| "epoch": 0.6938845317501038, |
| "grad_norm": 0.29725852608680725, |
| "learning_rate": 2.594131938181368e-06, |
| "loss": 0.4367, |
| "step": 14200 |
| }, |
| { |
| "epoch": 0.6943731828288011, |
| "grad_norm": 1.0349030494689941, |
| "learning_rate": 2.586659060807068e-06, |
| "loss": 0.4382, |
| "step": 14210 |
| }, |
| { |
| "epoch": 0.6948618339074983, |
| "grad_norm": 0.3708353340625763, |
| "learning_rate": 2.579193204989079e-06, |
| "loss": 0.4373, |
| "step": 14220 |
| }, |
| { |
| "epoch": 0.6953504849861956, |
| "grad_norm": 0.4205668568611145, |
| "learning_rate": 2.5717343924491224e-06, |
| "loss": 0.4362, |
| "step": 14230 |
| }, |
| { |
| "epoch": 0.6958391360648929, |
| "grad_norm": 0.6266738772392273, |
| "learning_rate": 2.564282644888434e-06, |
| "loss": 0.4372, |
| "step": 14240 |
| }, |
| { |
| "epoch": 0.6963277871435901, |
| "grad_norm": 0.43474340438842773, |
| "learning_rate": 2.5568379839876883e-06, |
| "loss": 0.4359, |
| "step": 14250 |
| }, |
| { |
| "epoch": 0.6968164382222873, |
| "grad_norm": 0.7086150646209717, |
| "learning_rate": 2.5494004314069422e-06, |
| "loss": 0.4357, |
| "step": 14260 |
| }, |
| { |
| "epoch": 0.6973050893009847, |
| "grad_norm": 0.6918942332267761, |
| "learning_rate": 2.5419700087855765e-06, |
| "loss": 0.4358, |
| "step": 14270 |
| }, |
| { |
| "epoch": 0.6977937403796819, |
| "grad_norm": 0.7701777219772339, |
| "learning_rate": 2.5345467377422216e-06, |
| "loss": 0.4369, |
| "step": 14280 |
| }, |
| { |
| "epoch": 0.6982823914583791, |
| "grad_norm": 0.40936869382858276, |
| "learning_rate": 2.527130639874701e-06, |
| "loss": 0.4364, |
| "step": 14290 |
| }, |
| { |
| "epoch": 0.6987710425370764, |
| "grad_norm": 0.432035356760025, |
| "learning_rate": 2.5197217367599726e-06, |
| "loss": 0.4366, |
| "step": 14300 |
| }, |
| { |
| "epoch": 0.6992596936157737, |
| "grad_norm": 0.41449683904647827, |
| "learning_rate": 2.512320049954056e-06, |
| "loss": 0.4359, |
| "step": 14310 |
| }, |
| { |
| "epoch": 0.6997483446944709, |
| "grad_norm": 0.49594905972480774, |
| "learning_rate": 2.50492560099198e-06, |
| "loss": 0.4364, |
| "step": 14320 |
| }, |
| { |
| "epoch": 0.7002369957731681, |
| "grad_norm": 0.38190391659736633, |
| "learning_rate": 2.4975384113877093e-06, |
| "loss": 0.4362, |
| "step": 14330 |
| }, |
| { |
| "epoch": 0.7007256468518654, |
| "grad_norm": 0.8239844441413879, |
| "learning_rate": 2.490158502634095e-06, |
| "loss": 0.4361, |
| "step": 14340 |
| }, |
| { |
| "epoch": 0.7012142979305627, |
| "grad_norm": 0.5367412567138672, |
| "learning_rate": 2.4827858962027994e-06, |
| "loss": 0.4355, |
| "step": 14350 |
| }, |
| { |
| "epoch": 0.7017029490092599, |
| "grad_norm": 0.4261118173599243, |
| "learning_rate": 2.475420613544237e-06, |
| "loss": 0.4357, |
| "step": 14360 |
| }, |
| { |
| "epoch": 0.7021916000879572, |
| "grad_norm": 0.7066627144813538, |
| "learning_rate": 2.468062676087522e-06, |
| "loss": 0.4379, |
| "step": 14370 |
| }, |
| { |
| "epoch": 0.7026802511666544, |
| "grad_norm": 0.7751229405403137, |
| "learning_rate": 2.4607121052403903e-06, |
| "loss": 0.4358, |
| "step": 14380 |
| }, |
| { |
| "epoch": 0.7031689022453517, |
| "grad_norm": 0.3944869041442871, |
| "learning_rate": 2.4533689223891466e-06, |
| "loss": 0.4371, |
| "step": 14390 |
| }, |
| { |
| "epoch": 0.703657553324049, |
| "grad_norm": 0.5122698545455933, |
| "learning_rate": 2.446033148898605e-06, |
| "loss": 0.4361, |
| "step": 14400 |
| }, |
| { |
| "epoch": 0.7041462044027462, |
| "grad_norm": 0.4192598760128021, |
| "learning_rate": 2.438704806112016e-06, |
| "loss": 0.4361, |
| "step": 14410 |
| }, |
| { |
| "epoch": 0.7046348554814434, |
| "grad_norm": 0.4704056680202484, |
| "learning_rate": 2.4313839153510112e-06, |
| "loss": 0.4359, |
| "step": 14420 |
| }, |
| { |
| "epoch": 0.7051235065601408, |
| "grad_norm": 0.3789515197277069, |
| "learning_rate": 2.4240704979155484e-06, |
| "loss": 0.436, |
| "step": 14430 |
| }, |
| { |
| "epoch": 0.705612157638838, |
| "grad_norm": 0.48638996481895447, |
| "learning_rate": 2.4167645750838336e-06, |
| "loss": 0.4366, |
| "step": 14440 |
| }, |
| { |
| "epoch": 0.7061008087175352, |
| "grad_norm": 0.3896729052066803, |
| "learning_rate": 2.4094661681122684e-06, |
| "loss": 0.4372, |
| "step": 14450 |
| }, |
| { |
| "epoch": 0.7065894597962326, |
| "grad_norm": 0.5547624826431274, |
| "learning_rate": 2.4021752982353918e-06, |
| "loss": 0.435, |
| "step": 14460 |
| }, |
| { |
| "epoch": 0.7070781108749298, |
| "grad_norm": 0.4325717091560364, |
| "learning_rate": 2.394891986665811e-06, |
| "loss": 0.4353, |
| "step": 14470 |
| }, |
| { |
| "epoch": 0.707566761953627, |
| "grad_norm": 0.46477776765823364, |
| "learning_rate": 2.387616254594139e-06, |
| "loss": 0.4372, |
| "step": 14480 |
| }, |
| { |
| "epoch": 0.7080554130323242, |
| "grad_norm": 0.39680078625679016, |
| "learning_rate": 2.3803481231889443e-06, |
| "loss": 0.4359, |
| "step": 14490 |
| }, |
| { |
| "epoch": 0.7085440641110216, |
| "grad_norm": 0.34461086988449097, |
| "learning_rate": 2.3730876135966746e-06, |
| "loss": 0.4377, |
| "step": 14500 |
| }, |
| { |
| "epoch": 0.7085440641110216, |
| "eval_loss": 0.4154199957847595, |
| "eval_runtime": 729.053, |
| "eval_samples_per_second": 242.654, |
| "eval_steps_per_second": 0.475, |
| "step": 14500 |
| }, |
| { |
| "epoch": 0.7090327151897188, |
| "grad_norm": 0.4224153757095337, |
| "learning_rate": 2.3658347469416037e-06, |
| "loss": 0.4366, |
| "step": 14510 |
| }, |
| { |
| "epoch": 0.709521366268416, |
| "grad_norm": 0.32037585973739624, |
| "learning_rate": 2.3585895443257705e-06, |
| "loss": 0.4364, |
| "step": 14520 |
| }, |
| { |
| "epoch": 0.7100100173471133, |
| "grad_norm": 0.6405905485153198, |
| "learning_rate": 2.351352026828917e-06, |
| "loss": 0.4359, |
| "step": 14530 |
| }, |
| { |
| "epoch": 0.7104986684258106, |
| "grad_norm": 0.4093703627586365, |
| "learning_rate": 2.3441222155084196e-06, |
| "loss": 0.4372, |
| "step": 14540 |
| }, |
| { |
| "epoch": 0.7109873195045078, |
| "grad_norm": 0.31071528792381287, |
| "learning_rate": 2.3369001313992373e-06, |
| "loss": 0.4367, |
| "step": 14550 |
| }, |
| { |
| "epoch": 0.7114759705832051, |
| "grad_norm": 0.502044141292572, |
| "learning_rate": 2.3296857955138493e-06, |
| "loss": 0.4365, |
| "step": 14560 |
| }, |
| { |
| "epoch": 0.7119646216619023, |
| "grad_norm": 0.5427960753440857, |
| "learning_rate": 2.3224792288421873e-06, |
| "loss": 0.4372, |
| "step": 14570 |
| }, |
| { |
| "epoch": 0.7124532727405996, |
| "grad_norm": 0.6338086128234863, |
| "learning_rate": 2.3152804523515787e-06, |
| "loss": 0.4358, |
| "step": 14580 |
| }, |
| { |
| "epoch": 0.7129419238192969, |
| "grad_norm": 0.36875244975090027, |
| "learning_rate": 2.3080894869866906e-06, |
| "loss": 0.436, |
| "step": 14590 |
| }, |
| { |
| "epoch": 0.7134305748979941, |
| "grad_norm": 0.39585214853286743, |
| "learning_rate": 2.3009063536694588e-06, |
| "loss": 0.4334, |
| "step": 14600 |
| }, |
| { |
| "epoch": 0.7139192259766913, |
| "grad_norm": 0.4556538164615631, |
| "learning_rate": 2.293731073299032e-06, |
| "loss": 0.4367, |
| "step": 14610 |
| }, |
| { |
| "epoch": 0.7144078770553886, |
| "grad_norm": 0.4585372507572174, |
| "learning_rate": 2.286563666751714e-06, |
| "loss": 0.4344, |
| "step": 14620 |
| }, |
| { |
| "epoch": 0.7148965281340859, |
| "grad_norm": 0.3792722821235657, |
| "learning_rate": 2.2794041548809013e-06, |
| "loss": 0.4372, |
| "step": 14630 |
| }, |
| { |
| "epoch": 0.7153851792127831, |
| "grad_norm": 0.5071465969085693, |
| "learning_rate": 2.2722525585170136e-06, |
| "loss": 0.437, |
| "step": 14640 |
| }, |
| { |
| "epoch": 0.7158738302914803, |
| "grad_norm": 0.47391828894615173, |
| "learning_rate": 2.265108898467449e-06, |
| "loss": 0.4361, |
| "step": 14650 |
| }, |
| { |
| "epoch": 0.7163624813701777, |
| "grad_norm": 0.450090229511261, |
| "learning_rate": 2.2579731955165098e-06, |
| "loss": 0.435, |
| "step": 14660 |
| }, |
| { |
| "epoch": 0.7168511324488749, |
| "grad_norm": 0.4352344870567322, |
| "learning_rate": 2.250845470425346e-06, |
| "loss": 0.4358, |
| "step": 14670 |
| }, |
| { |
| "epoch": 0.7173397835275721, |
| "grad_norm": 1.0980722904205322, |
| "learning_rate": 2.2437257439319045e-06, |
| "loss": 0.4349, |
| "step": 14680 |
| }, |
| { |
| "epoch": 0.7178284346062694, |
| "grad_norm": 0.7365118265151978, |
| "learning_rate": 2.2366140367508515e-06, |
| "loss": 0.436, |
| "step": 14690 |
| }, |
| { |
| "epoch": 0.7183170856849667, |
| "grad_norm": 0.3632850646972656, |
| "learning_rate": 2.2295103695735237e-06, |
| "loss": 0.437, |
| "step": 14700 |
| }, |
| { |
| "epoch": 0.7188057367636639, |
| "grad_norm": 0.4772653877735138, |
| "learning_rate": 2.2224147630678698e-06, |
| "loss": 0.434, |
| "step": 14710 |
| }, |
| { |
| "epoch": 0.7192943878423612, |
| "grad_norm": 0.533318042755127, |
| "learning_rate": 2.2153272378783823e-06, |
| "loss": 0.4348, |
| "step": 14720 |
| }, |
| { |
| "epoch": 0.7197830389210584, |
| "grad_norm": 0.649156928062439, |
| "learning_rate": 2.2082478146260394e-06, |
| "loss": 0.4354, |
| "step": 14730 |
| }, |
| { |
| "epoch": 0.7202716899997557, |
| "grad_norm": 0.5530617833137512, |
| "learning_rate": 2.2011765139082514e-06, |
| "loss": 0.436, |
| "step": 14740 |
| }, |
| { |
| "epoch": 0.7207603410784529, |
| "grad_norm": 0.48404207825660706, |
| "learning_rate": 2.194113356298796e-06, |
| "loss": 0.4359, |
| "step": 14750 |
| }, |
| { |
| "epoch": 0.7212489921571502, |
| "grad_norm": 0.6402378082275391, |
| "learning_rate": 2.1870583623477554e-06, |
| "loss": 0.4366, |
| "step": 14760 |
| }, |
| { |
| "epoch": 0.7217376432358474, |
| "grad_norm": 0.4514593183994293, |
| "learning_rate": 2.1800115525814604e-06, |
| "loss": 0.4347, |
| "step": 14770 |
| }, |
| { |
| "epoch": 0.7222262943145447, |
| "grad_norm": 0.4350273013114929, |
| "learning_rate": 2.1729729475024337e-06, |
| "loss": 0.437, |
| "step": 14780 |
| }, |
| { |
| "epoch": 0.722714945393242, |
| "grad_norm": 0.7733496427536011, |
| "learning_rate": 2.165942567589324e-06, |
| "loss": 0.4362, |
| "step": 14790 |
| }, |
| { |
| "epoch": 0.7232035964719392, |
| "grad_norm": 0.3570731282234192, |
| "learning_rate": 2.158920433296846e-06, |
| "loss": 0.435, |
| "step": 14800 |
| }, |
| { |
| "epoch": 0.7236922475506364, |
| "grad_norm": 0.45792272686958313, |
| "learning_rate": 2.151906565055732e-06, |
| "loss": 0.4359, |
| "step": 14810 |
| }, |
| { |
| "epoch": 0.7241808986293338, |
| "grad_norm": 0.3383428454399109, |
| "learning_rate": 2.1449009832726576e-06, |
| "loss": 0.4367, |
| "step": 14820 |
| }, |
| { |
| "epoch": 0.724669549708031, |
| "grad_norm": 0.4315878450870514, |
| "learning_rate": 2.137903708330188e-06, |
| "loss": 0.4359, |
| "step": 14830 |
| }, |
| { |
| "epoch": 0.7251582007867282, |
| "grad_norm": 0.5013752579689026, |
| "learning_rate": 2.130914760586729e-06, |
| "loss": 0.4346, |
| "step": 14840 |
| }, |
| { |
| "epoch": 0.7256468518654255, |
| "grad_norm": 0.5946633815765381, |
| "learning_rate": 2.1239341603764506e-06, |
| "loss": 0.4355, |
| "step": 14850 |
| }, |
| { |
| "epoch": 0.7261355029441228, |
| "grad_norm": 1.4556235074996948, |
| "learning_rate": 2.1169619280092362e-06, |
| "loss": 0.4352, |
| "step": 14860 |
| }, |
| { |
| "epoch": 0.72662415402282, |
| "grad_norm": 0.49753642082214355, |
| "learning_rate": 2.109998083770628e-06, |
| "loss": 0.4369, |
| "step": 14870 |
| }, |
| { |
| "epoch": 0.7271128051015172, |
| "grad_norm": 0.3729608654975891, |
| "learning_rate": 2.103042647921758e-06, |
| "loss": 0.4356, |
| "step": 14880 |
| }, |
| { |
| "epoch": 0.7276014561802145, |
| "grad_norm": 0.39122653007507324, |
| "learning_rate": 2.096095640699295e-06, |
| "loss": 0.4368, |
| "step": 14890 |
| }, |
| { |
| "epoch": 0.7280901072589118, |
| "grad_norm": 0.42691490054130554, |
| "learning_rate": 2.08915708231539e-06, |
| "loss": 0.4357, |
| "step": 14900 |
| }, |
| { |
| "epoch": 0.728578758337609, |
| "grad_norm": 0.38435041904449463, |
| "learning_rate": 2.0822269929576066e-06, |
| "loss": 0.4363, |
| "step": 14910 |
| }, |
| { |
| "epoch": 0.7290674094163063, |
| "grad_norm": 0.8433852195739746, |
| "learning_rate": 2.075305392788868e-06, |
| "loss": 0.4366, |
| "step": 14920 |
| }, |
| { |
| "epoch": 0.7295560604950035, |
| "grad_norm": 0.5046951174736023, |
| "learning_rate": 2.0683923019474016e-06, |
| "loss": 0.4358, |
| "step": 14930 |
| }, |
| { |
| "epoch": 0.7300447115737008, |
| "grad_norm": 0.9538094401359558, |
| "learning_rate": 2.061487740546679e-06, |
| "loss": 0.4358, |
| "step": 14940 |
| }, |
| { |
| "epoch": 0.7305333626523981, |
| "grad_norm": 0.542107343673706, |
| "learning_rate": 2.0545917286753494e-06, |
| "loss": 0.437, |
| "step": 14950 |
| }, |
| { |
| "epoch": 0.7310220137310953, |
| "grad_norm": 0.2896505296230316, |
| "learning_rate": 2.047704286397188e-06, |
| "loss": 0.4355, |
| "step": 14960 |
| }, |
| { |
| "epoch": 0.7315106648097925, |
| "grad_norm": 0.43803542852401733, |
| "learning_rate": 2.040825433751044e-06, |
| "loss": 0.4363, |
| "step": 14970 |
| }, |
| { |
| "epoch": 0.7319993158884899, |
| "grad_norm": 0.5424397587776184, |
| "learning_rate": 2.0339551907507687e-06, |
| "loss": 0.4366, |
| "step": 14980 |
| }, |
| { |
| "epoch": 0.7324879669671871, |
| "grad_norm": 0.5848090648651123, |
| "learning_rate": 2.027093577385163e-06, |
| "loss": 0.4349, |
| "step": 14990 |
| }, |
| { |
| "epoch": 0.7329766180458843, |
| "grad_norm": 0.3782629072666168, |
| "learning_rate": 2.0202406136179275e-06, |
| "loss": 0.4372, |
| "step": 15000 |
| }, |
| { |
| "epoch": 0.7329766180458843, |
| "eval_loss": 0.4146045744419098, |
| "eval_runtime": 728.9472, |
| "eval_samples_per_second": 242.69, |
| "eval_steps_per_second": 0.475, |
| "step": 15000 |
| }, |
| { |
| "epoch": 0.7334652691245815, |
| "grad_norm": 0.27179810404777527, |
| "learning_rate": 2.01339631938759e-06, |
| "loss": 0.4349, |
| "step": 15010 |
| }, |
| { |
| "epoch": 0.7339539202032789, |
| "grad_norm": 0.6157824397087097, |
| "learning_rate": 2.006560714607455e-06, |
| "loss": 0.436, |
| "step": 15020 |
| }, |
| { |
| "epoch": 0.7344425712819761, |
| "grad_norm": 0.38568001985549927, |
| "learning_rate": 1.99973381916555e-06, |
| "loss": 0.4353, |
| "step": 15030 |
| }, |
| { |
| "epoch": 0.7349312223606733, |
| "grad_norm": 0.3673468232154846, |
| "learning_rate": 1.992915652924558e-06, |
| "loss": 0.4365, |
| "step": 15040 |
| }, |
| { |
| "epoch": 0.7354198734393707, |
| "grad_norm": 0.4711572229862213, |
| "learning_rate": 1.986106235721769e-06, |
| "loss": 0.4348, |
| "step": 15050 |
| }, |
| { |
| "epoch": 0.7359085245180679, |
| "grad_norm": 0.30081677436828613, |
| "learning_rate": 1.9793055873690115e-06, |
| "loss": 0.4361, |
| "step": 15060 |
| }, |
| { |
| "epoch": 0.7363971755967651, |
| "grad_norm": 0.49421292543411255, |
| "learning_rate": 1.9725137276526098e-06, |
| "loss": 0.436, |
| "step": 15070 |
| }, |
| { |
| "epoch": 0.7368858266754624, |
| "grad_norm": 0.4806350767612457, |
| "learning_rate": 1.965730676333309e-06, |
| "loss": 0.4352, |
| "step": 15080 |
| }, |
| { |
| "epoch": 0.7373744777541597, |
| "grad_norm": 0.7303268909454346, |
| "learning_rate": 1.9589564531462344e-06, |
| "loss": 0.4351, |
| "step": 15090 |
| }, |
| { |
| "epoch": 0.7378631288328569, |
| "grad_norm": 0.3639063537120819, |
| "learning_rate": 1.952191077800821e-06, |
| "loss": 0.4361, |
| "step": 15100 |
| }, |
| { |
| "epoch": 0.7383517799115542, |
| "grad_norm": 0.3184981048107147, |
| "learning_rate": 1.94543456998076e-06, |
| "loss": 0.4361, |
| "step": 15110 |
| }, |
| { |
| "epoch": 0.7388404309902514, |
| "grad_norm": 0.4460330605506897, |
| "learning_rate": 1.9386869493439485e-06, |
| "loss": 0.4367, |
| "step": 15120 |
| }, |
| { |
| "epoch": 0.7393290820689487, |
| "grad_norm": 0.2961271107196808, |
| "learning_rate": 1.9319482355224235e-06, |
| "loss": 0.435, |
| "step": 15130 |
| }, |
| { |
| "epoch": 0.7398177331476459, |
| "grad_norm": 0.4846443235874176, |
| "learning_rate": 1.9252184481223033e-06, |
| "loss": 0.4354, |
| "step": 15140 |
| }, |
| { |
| "epoch": 0.7403063842263432, |
| "grad_norm": 0.35571032762527466, |
| "learning_rate": 1.918497606723744e-06, |
| "loss": 0.436, |
| "step": 15150 |
| }, |
| { |
| "epoch": 0.7407950353050404, |
| "grad_norm": 0.5735732913017273, |
| "learning_rate": 1.9117857308808687e-06, |
| "loss": 0.4358, |
| "step": 15160 |
| }, |
| { |
| "epoch": 0.7412836863837376, |
| "grad_norm": 0.5794824361801147, |
| "learning_rate": 1.9050828401217142e-06, |
| "loss": 0.436, |
| "step": 15170 |
| }, |
| { |
| "epoch": 0.741772337462435, |
| "grad_norm": 0.25915631651878357, |
| "learning_rate": 1.8983889539481754e-06, |
| "loss": 0.4357, |
| "step": 15180 |
| }, |
| { |
| "epoch": 0.7422609885411322, |
| "grad_norm": 0.582955002784729, |
| "learning_rate": 1.891704091835953e-06, |
| "loss": 0.4368, |
| "step": 15190 |
| }, |
| { |
| "epoch": 0.7427496396198294, |
| "grad_norm": 0.42489010095596313, |
| "learning_rate": 1.8850282732344887e-06, |
| "loss": 0.4354, |
| "step": 15200 |
| }, |
| { |
| "epoch": 0.7432382906985268, |
| "grad_norm": 0.31416329741477966, |
| "learning_rate": 1.8783615175669106e-06, |
| "loss": 0.4354, |
| "step": 15210 |
| }, |
| { |
| "epoch": 0.743726941777224, |
| "grad_norm": 4.887961387634277, |
| "learning_rate": 1.871703844229985e-06, |
| "loss": 0.4385, |
| "step": 15220 |
| }, |
| { |
| "epoch": 0.7442155928559212, |
| "grad_norm": 1.1010756492614746, |
| "learning_rate": 1.8650552725940468e-06, |
| "loss": 0.4357, |
| "step": 15230 |
| }, |
| { |
| "epoch": 0.7447042439346185, |
| "grad_norm": 0.46031710505485535, |
| "learning_rate": 1.8584158220029514e-06, |
| "loss": 0.4363, |
| "step": 15240 |
| }, |
| { |
| "epoch": 0.7451928950133158, |
| "grad_norm": 0.8031518459320068, |
| "learning_rate": 1.851785511774018e-06, |
| "loss": 0.4355, |
| "step": 15250 |
| }, |
| { |
| "epoch": 0.745681546092013, |
| "grad_norm": 0.33631330728530884, |
| "learning_rate": 1.8451643611979746e-06, |
| "loss": 0.4358, |
| "step": 15260 |
| }, |
| { |
| "epoch": 0.7461701971707102, |
| "grad_norm": 0.3979465365409851, |
| "learning_rate": 1.838552389538894e-06, |
| "loss": 0.4353, |
| "step": 15270 |
| }, |
| { |
| "epoch": 0.7466588482494075, |
| "grad_norm": 0.5838291049003601, |
| "learning_rate": 1.831949616034145e-06, |
| "loss": 0.4369, |
| "step": 15280 |
| }, |
| { |
| "epoch": 0.7471474993281048, |
| "grad_norm": 0.34562933444976807, |
| "learning_rate": 1.8253560598943377e-06, |
| "loss": 0.4373, |
| "step": 15290 |
| }, |
| { |
| "epoch": 0.747636150406802, |
| "grad_norm": 0.42259690165519714, |
| "learning_rate": 1.81877174030326e-06, |
| "loss": 0.436, |
| "step": 15300 |
| }, |
| { |
| "epoch": 0.7481248014854993, |
| "grad_norm": 0.47943800687789917, |
| "learning_rate": 1.8121966764178278e-06, |
| "loss": 0.4341, |
| "step": 15310 |
| }, |
| { |
| "epoch": 0.7486134525641965, |
| "grad_norm": 0.4682493805885315, |
| "learning_rate": 1.8056308873680316e-06, |
| "loss": 0.4361, |
| "step": 15320 |
| }, |
| { |
| "epoch": 0.7491021036428938, |
| "grad_norm": 0.5536458492279053, |
| "learning_rate": 1.7990743922568699e-06, |
| "loss": 0.4359, |
| "step": 15330 |
| }, |
| { |
| "epoch": 0.7495907547215911, |
| "grad_norm": 0.3631746768951416, |
| "learning_rate": 1.7925272101603076e-06, |
| "loss": 0.4358, |
| "step": 15340 |
| }, |
| { |
| "epoch": 0.7500794058002883, |
| "grad_norm": 0.480092853307724, |
| "learning_rate": 1.7859893601272077e-06, |
| "loss": 0.4362, |
| "step": 15350 |
| }, |
| { |
| "epoch": 0.7505680568789855, |
| "grad_norm": 0.4252304434776306, |
| "learning_rate": 1.7794608611792873e-06, |
| "loss": 0.4339, |
| "step": 15360 |
| }, |
| { |
| "epoch": 0.7510567079576829, |
| "grad_norm": 0.37599998712539673, |
| "learning_rate": 1.772941732311052e-06, |
| "loss": 0.4346, |
| "step": 15370 |
| }, |
| { |
| "epoch": 0.7515453590363801, |
| "grad_norm": 0.5096463561058044, |
| "learning_rate": 1.7664319924897493e-06, |
| "loss": 0.4361, |
| "step": 15380 |
| }, |
| { |
| "epoch": 0.7520340101150773, |
| "grad_norm": 0.402937650680542, |
| "learning_rate": 1.7599316606553074e-06, |
| "loss": 0.4345, |
| "step": 15390 |
| }, |
| { |
| "epoch": 0.7525226611937745, |
| "grad_norm": 0.5899362564086914, |
| "learning_rate": 1.75344075572028e-06, |
| "loss": 0.4354, |
| "step": 15400 |
| }, |
| { |
| "epoch": 0.7530113122724719, |
| "grad_norm": 0.6552911996841431, |
| "learning_rate": 1.7469592965697985e-06, |
| "loss": 0.4367, |
| "step": 15410 |
| }, |
| { |
| "epoch": 0.7534999633511691, |
| "grad_norm": 0.34461089968681335, |
| "learning_rate": 1.7404873020615092e-06, |
| "loss": 0.4356, |
| "step": 15420 |
| }, |
| { |
| "epoch": 0.7539886144298663, |
| "grad_norm": 0.34054285287857056, |
| "learning_rate": 1.7340247910255193e-06, |
| "loss": 0.4347, |
| "step": 15430 |
| }, |
| { |
| "epoch": 0.7544772655085636, |
| "grad_norm": 0.548925518989563, |
| "learning_rate": 1.7275717822643496e-06, |
| "loss": 0.4356, |
| "step": 15440 |
| }, |
| { |
| "epoch": 0.7549659165872609, |
| "grad_norm": 0.3071838319301605, |
| "learning_rate": 1.7211282945528667e-06, |
| "loss": 0.4346, |
| "step": 15450 |
| }, |
| { |
| "epoch": 0.7554545676659581, |
| "grad_norm": 0.32380637526512146, |
| "learning_rate": 1.714694346638245e-06, |
| "loss": 0.4363, |
| "step": 15460 |
| }, |
| { |
| "epoch": 0.7559432187446554, |
| "grad_norm": 0.3220982253551483, |
| "learning_rate": 1.7082699572398941e-06, |
| "loss": 0.4356, |
| "step": 15470 |
| }, |
| { |
| "epoch": 0.7564318698233526, |
| "grad_norm": 0.48519644141197205, |
| "learning_rate": 1.7018551450494208e-06, |
| "loss": 0.4337, |
| "step": 15480 |
| }, |
| { |
| "epoch": 0.7569205209020499, |
| "grad_norm": 0.49619343876838684, |
| "learning_rate": 1.6954499287305625e-06, |
| "loss": 0.4359, |
| "step": 15490 |
| }, |
| { |
| "epoch": 0.7574091719807472, |
| "grad_norm": 0.31478312611579895, |
| "learning_rate": 1.6890543269191372e-06, |
| "loss": 0.4353, |
| "step": 15500 |
| }, |
| { |
| "epoch": 0.7574091719807472, |
| "eval_loss": 0.4151374399662018, |
| "eval_runtime": 729.456, |
| "eval_samples_per_second": 242.52, |
| "eval_steps_per_second": 0.474, |
| "step": 15500 |
| }, |
| { |
| "epoch": 0.7578978230594444, |
| "grad_norm": 0.5134409666061401, |
| "learning_rate": 4.396678065461651e-08, |
| "loss": 0.4363, |
| "step": 15510 |
| }, |
| { |
| "epoch": 0.7583864741381416, |
| "grad_norm": 0.3412030041217804, |
| "learning_rate": 9.281875915974597e-08, |
| "loss": 0.435, |
| "step": 15520 |
| }, |
| { |
| "epoch": 0.7588751252168389, |
| "grad_norm": 0.3823215365409851, |
| "learning_rate": 1.4167073766487544e-07, |
| "loss": 0.4359, |
| "step": 15530 |
| }, |
| { |
| "epoch": 0.7593637762955362, |
| "grad_norm": 0.3025282323360443, |
| "learning_rate": 1.905227161700049e-07, |
| "loss": 0.4355, |
| "step": 15540 |
| }, |
| { |
| "epoch": 0.7598524273742334, |
| "grad_norm": 0.4344797730445862, |
| "learning_rate": 2.3937469467513437e-07, |
| "loss": 0.4356, |
| "step": 15550 |
| }, |
| { |
| "epoch": 0.7603410784529306, |
| "grad_norm": 0.28436729311943054, |
| "learning_rate": 2.8822667318026384e-07, |
| "loss": 0.436, |
| "step": 15560 |
| }, |
| { |
| "epoch": 0.760829729531628, |
| "grad_norm": 0.3204064667224884, |
| "learning_rate": 3.3707865168539325e-07, |
| "loss": 0.4361, |
| "step": 15570 |
| }, |
| { |
| "epoch": 0.7613183806103252, |
| "grad_norm": 0.3875465989112854, |
| "learning_rate": 3.859306301905227e-07, |
| "loss": 0.4341, |
| "step": 15580 |
| }, |
| { |
| "epoch": 0.7618070316890224, |
| "grad_norm": 0.368078351020813, |
| "learning_rate": 4.347826086956522e-07, |
| "loss": 0.4351, |
| "step": 15590 |
| }, |
| { |
| "epoch": 0.7622956827677198, |
| "grad_norm": 0.36300018429756165, |
| "learning_rate": 4.836345872007817e-07, |
| "loss": 0.4344, |
| "step": 15600 |
| }, |
| { |
| "epoch": 0.762784333846417, |
| "grad_norm": 0.42110690474510193, |
| "learning_rate": 5.324865657059111e-07, |
| "loss": 0.434, |
| "step": 15610 |
| }, |
| { |
| "epoch": 0.7632729849251142, |
| "grad_norm": 0.37072572112083435, |
| "learning_rate": 5.813385442110406e-07, |
| "loss": 0.4354, |
| "step": 15620 |
| }, |
| { |
| "epoch": 0.7637616360038115, |
| "grad_norm": 0.5293629169464111, |
| "learning_rate": 6.3019052271617e-07, |
| "loss": 0.4342, |
| "step": 15630 |
| }, |
| { |
| "epoch": 0.7642502870825088, |
| "grad_norm": 0.31591010093688965, |
| "learning_rate": 6.790425012212995e-07, |
| "loss": 0.4343, |
| "step": 15640 |
| }, |
| { |
| "epoch": 0.764738938161206, |
| "grad_norm": 0.27564629912376404, |
| "learning_rate": 7.278944797264289e-07, |
| "loss": 0.4364, |
| "step": 15650 |
| }, |
| { |
| "epoch": 0.7652275892399032, |
| "grad_norm": 0.29514557123184204, |
| "learning_rate": 7.767464582315585e-07, |
| "loss": 0.4349, |
| "step": 15660 |
| }, |
| { |
| "epoch": 0.7657162403186005, |
| "grad_norm": 0.26547813415527344, |
| "learning_rate": 8.255984367366879e-07, |
| "loss": 0.4357, |
| "step": 15670 |
| }, |
| { |
| "epoch": 0.7662048913972977, |
| "grad_norm": 0.3546208441257477, |
| "learning_rate": 8.744504152418174e-07, |
| "loss": 0.4342, |
| "step": 15680 |
| }, |
| { |
| "epoch": 0.766693542475995, |
| "grad_norm": 0.6953465938568115, |
| "learning_rate": 9.233023937469468e-07, |
| "loss": 0.4339, |
| "step": 15690 |
| }, |
| { |
| "epoch": 0.7671821935546923, |
| "grad_norm": 0.37491822242736816, |
| "learning_rate": 9.721543722520762e-07, |
| "loss": 0.4357, |
| "step": 15700 |
| }, |
| { |
| "epoch": 0.7676708446333895, |
| "grad_norm": 0.4774235486984253, |
| "learning_rate": 1.0210063507572057e-06, |
| "loss": 0.435, |
| "step": 15710 |
| }, |
| { |
| "epoch": 0.7681594957120867, |
| "grad_norm": 0.47825121879577637, |
| "learning_rate": 1.0698583292623353e-06, |
| "loss": 0.4345, |
| "step": 15720 |
| }, |
| { |
| "epoch": 0.7686481467907841, |
| "grad_norm": 0.35943761467933655, |
| "learning_rate": 1.1187103077674646e-06, |
| "loss": 0.4345, |
| "step": 15730 |
| }, |
| { |
| "epoch": 0.7691367978694813, |
| "grad_norm": 0.41238027811050415, |
| "learning_rate": 1.167562286272594e-06, |
| "loss": 0.4351, |
| "step": 15740 |
| }, |
| { |
| "epoch": 0.7696254489481785, |
| "grad_norm": 0.5406340956687927, |
| "learning_rate": 1.2164142647777236e-06, |
| "loss": 0.4347, |
| "step": 15750 |
| }, |
| { |
| "epoch": 0.7701141000268759, |
| "grad_norm": 0.3181721568107605, |
| "learning_rate": 1.265266243282853e-06, |
| "loss": 0.4354, |
| "step": 15760 |
| }, |
| { |
| "epoch": 0.7706027511055731, |
| "grad_norm": 0.37955865263938904, |
| "learning_rate": 1.3141182217879824e-06, |
| "loss": 0.4347, |
| "step": 15770 |
| }, |
| { |
| "epoch": 0.7710914021842703, |
| "grad_norm": 0.3683488667011261, |
| "learning_rate": 1.362970200293112e-06, |
| "loss": 0.4361, |
| "step": 15780 |
| }, |
| { |
| "epoch": 0.7715800532629675, |
| "grad_norm": 0.3671647012233734, |
| "learning_rate": 1.4118221787982415e-06, |
| "loss": 0.4348, |
| "step": 15790 |
| }, |
| { |
| "epoch": 0.7720687043416649, |
| "grad_norm": 0.4749736189842224, |
| "learning_rate": 1.4606741573033708e-06, |
| "loss": 0.4354, |
| "step": 15800 |
| }, |
| { |
| "epoch": 0.7725573554203621, |
| "grad_norm": 0.2920779883861542, |
| "learning_rate": 1.5095261358085003e-06, |
| "loss": 0.4349, |
| "step": 15810 |
| }, |
| { |
| "epoch": 0.7730460064990593, |
| "grad_norm": 0.5698887705802917, |
| "learning_rate": 1.5583781143136298e-06, |
| "loss": 0.4349, |
| "step": 15820 |
| }, |
| { |
| "epoch": 0.7735346575777566, |
| "grad_norm": 0.4958445131778717, |
| "learning_rate": 1.6072300928187593e-06, |
| "loss": 0.4373, |
| "step": 15830 |
| }, |
| { |
| "epoch": 0.7740233086564539, |
| "grad_norm": 0.37633660435676575, |
| "learning_rate": 1.6560820713238887e-06, |
| "loss": 0.4356, |
| "step": 15840 |
| }, |
| { |
| "epoch": 0.7745119597351511, |
| "grad_norm": 0.3820544183254242, |
| "learning_rate": 1.7049340498290182e-06, |
| "loss": 0.4351, |
| "step": 15850 |
| }, |
| { |
| "epoch": 0.7750006108138484, |
| "grad_norm": 0.3899173140525818, |
| "learning_rate": 1.7537860283341477e-06, |
| "loss": 0.4355, |
| "step": 15860 |
| }, |
| { |
| "epoch": 0.7754892618925456, |
| "grad_norm": 0.36729347705841064, |
| "learning_rate": 1.802638006839277e-06, |
| "loss": 0.4353, |
| "step": 15870 |
| }, |
| { |
| "epoch": 0.7759779129712429, |
| "grad_norm": 0.442569762468338, |
| "learning_rate": 1.8514899853444065e-06, |
| "loss": 0.4363, |
| "step": 15880 |
| }, |
| { |
| "epoch": 0.7764665640499402, |
| "grad_norm": 0.5207741260528564, |
| "learning_rate": 1.900341963849536e-06, |
| "loss": 0.4362, |
| "step": 15890 |
| }, |
| { |
| "epoch": 0.7769552151286374, |
| "grad_norm": 0.901549756526947, |
| "learning_rate": 1.9491939423546656e-06, |
| "loss": 0.4359, |
| "step": 15900 |
| }, |
| { |
| "epoch": 0.7774438662073346, |
| "grad_norm": 0.5226088166236877, |
| "learning_rate": 1.998045920859795e-06, |
| "loss": 0.4362, |
| "step": 15910 |
| }, |
| { |
| "epoch": 0.7779325172860319, |
| "grad_norm": 0.7250573635101318, |
| "learning_rate": 2.046897899364924e-06, |
| "loss": 0.4374, |
| "step": 15920 |
| }, |
| { |
| "epoch": 0.7784211683647292, |
| "grad_norm": 0.34755152463912964, |
| "learning_rate": 2.0957498778700537e-06, |
| "loss": 0.4355, |
| "step": 15930 |
| }, |
| { |
| "epoch": 0.7789098194434264, |
| "grad_norm": 0.37030619382858276, |
| "learning_rate": 2.1446018563751832e-06, |
| "loss": 0.4356, |
| "step": 15940 |
| }, |
| { |
| "epoch": 0.7793984705221236, |
| "grad_norm": 0.44449082016944885, |
| "learning_rate": 2.1934538348803127e-06, |
| "loss": 0.4349, |
| "step": 15950 |
| }, |
| { |
| "epoch": 0.779887121600821, |
| "grad_norm": 1.273260235786438, |
| "learning_rate": 2.2423058133854423e-06, |
| "loss": 0.4356, |
| "step": 15960 |
| }, |
| { |
| "epoch": 0.7803757726795182, |
| "grad_norm": 0.6899981498718262, |
| "learning_rate": 2.2911577918905718e-06, |
| "loss": 0.4355, |
| "step": 15970 |
| }, |
| { |
| "epoch": 0.7808644237582154, |
| "grad_norm": 0.5005556344985962, |
| "learning_rate": 2.3400097703957013e-06, |
| "loss": 0.4347, |
| "step": 15980 |
| }, |
| { |
| "epoch": 0.7813530748369127, |
| "grad_norm": 0.5572786331176758, |
| "learning_rate": 2.388861748900831e-06, |
| "loss": 0.4357, |
| "step": 15990 |
| }, |
| { |
| "epoch": 0.78184172591561, |
| "grad_norm": 0.6249894499778748, |
| "learning_rate": 2.43771372740596e-06, |
| "loss": 0.4348, |
| "step": 16000 |
| }, |
| { |
| "epoch": 0.78184172591561, |
| "eval_loss": 0.4141230583190918, |
| "eval_runtime": 729.2488, |
| "eval_samples_per_second": 242.589, |
| "eval_steps_per_second": 0.474, |
| "step": 16000 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 20465, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.7679175174984827e+19, |
| "train_batch_size": 256, |
| "trial_name": null, |
| "trial_params": null |
| } |