| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 3.0, |
| "eval_steps": 500, |
| "global_step": 2916, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.005145356315924878, |
| "grad_norm": 3.516500949859619, |
| "learning_rate": 5.47945205479452e-07, |
| "loss": 1.0549, |
| "mean_token_accuracy": 0.7244073122739791, |
| "num_tokens": 10390477.0, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.010290712631849755, |
| "grad_norm": 2.3721697330474854, |
| "learning_rate": 1.2328767123287673e-06, |
| "loss": 1.0331, |
| "mean_token_accuracy": 0.7277479201555253, |
| "num_tokens": 20784784.0, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.015436068947774634, |
| "grad_norm": 1.1673978567123413, |
| "learning_rate": 1.9178082191780823e-06, |
| "loss": 1.0055, |
| "mean_token_accuracy": 0.7293060600757599, |
| "num_tokens": 31194057.0, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.02058142526369951, |
| "grad_norm": 1.1831053495407104, |
| "learning_rate": 2.6027397260273973e-06, |
| "loss": 0.9849, |
| "mean_token_accuracy": 0.7313222289085388, |
| "num_tokens": 41573889.0, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.025726781579624387, |
| "grad_norm": 0.7422630786895752, |
| "learning_rate": 3.2876712328767123e-06, |
| "loss": 0.9496, |
| "mean_token_accuracy": 0.7391268193721772, |
| "num_tokens": 51955847.0, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.030872137895549268, |
| "grad_norm": 0.4367648959159851, |
| "learning_rate": 3.972602739726027e-06, |
| "loss": 0.9444, |
| "mean_token_accuracy": 0.7393645942211151, |
| "num_tokens": 62362424.0, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.03601749421147415, |
| "grad_norm": 0.38364696502685547, |
| "learning_rate": 4.657534246575343e-06, |
| "loss": 0.9117, |
| "mean_token_accuracy": 0.7468442142009735, |
| "num_tokens": 72760156.0, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.04116285052739902, |
| "grad_norm": 0.32832905650138855, |
| "learning_rate": 5.342465753424658e-06, |
| "loss": 0.9062, |
| "mean_token_accuracy": 0.7470937430858612, |
| "num_tokens": 83160696.0, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.0463082068433239, |
| "grad_norm": 0.254768967628479, |
| "learning_rate": 6.027397260273973e-06, |
| "loss": 0.8938, |
| "mean_token_accuracy": 0.7495229691267014, |
| "num_tokens": 93541996.0, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.051453563159248775, |
| "grad_norm": 0.23051689565181732, |
| "learning_rate": 6.712328767123288e-06, |
| "loss": 0.9012, |
| "mean_token_accuracy": 0.7474822252988815, |
| "num_tokens": 103951096.0, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.056598919475173655, |
| "grad_norm": 0.20961317420005798, |
| "learning_rate": 7.397260273972603e-06, |
| "loss": 0.8874, |
| "mean_token_accuracy": 0.75033338367939, |
| "num_tokens": 114348089.0, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.061744275791098535, |
| "grad_norm": 0.20559196174144745, |
| "learning_rate": 8.082191780821919e-06, |
| "loss": 0.8788, |
| "mean_token_accuracy": 0.7523881673812867, |
| "num_tokens": 124723656.0, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.06688963210702341, |
| "grad_norm": 0.2094876766204834, |
| "learning_rate": 8.767123287671233e-06, |
| "loss": 0.87, |
| "mean_token_accuracy": 0.7535106301307678, |
| "num_tokens": 135146677.0, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.0720349884229483, |
| "grad_norm": 0.19069737195968628, |
| "learning_rate": 9.452054794520548e-06, |
| "loss": 0.8749, |
| "mean_token_accuracy": 0.7522967606782913, |
| "num_tokens": 145548072.0, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.07718034473887317, |
| "grad_norm": 0.19116425514221191, |
| "learning_rate": 1.0136986301369864e-05, |
| "loss": 0.8592, |
| "mean_token_accuracy": 0.7557980835437774, |
| "num_tokens": 155949133.0, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.08232570105479804, |
| "grad_norm": 0.21261022984981537, |
| "learning_rate": 1.082191780821918e-05, |
| "loss": 0.8596, |
| "mean_token_accuracy": 0.7559153437614441, |
| "num_tokens": 166313331.0, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.08747105737072293, |
| "grad_norm": 0.19367730617523193, |
| "learning_rate": 1.1506849315068493e-05, |
| "loss": 0.8616, |
| "mean_token_accuracy": 0.7551171153783798, |
| "num_tokens": 176729555.0, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.0926164136866478, |
| "grad_norm": 0.18082526326179504, |
| "learning_rate": 1.219178082191781e-05, |
| "loss": 0.8489, |
| "mean_token_accuracy": 0.757958498597145, |
| "num_tokens": 187130427.0, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.09776177000257268, |
| "grad_norm": 0.20260317623615265, |
| "learning_rate": 1.2876712328767125e-05, |
| "loss": 0.8508, |
| "mean_token_accuracy": 0.7575758665800094, |
| "num_tokens": 197551850.0, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.10290712631849755, |
| "grad_norm": 0.23283614218235016, |
| "learning_rate": 1.356164383561644e-05, |
| "loss": 0.8489, |
| "mean_token_accuracy": 0.7578668266534805, |
| "num_tokens": 207961400.0, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.10805248263442244, |
| "grad_norm": 0.21379666030406952, |
| "learning_rate": 1.4246575342465754e-05, |
| "loss": 0.8339, |
| "mean_token_accuracy": 0.7610192090272904, |
| "num_tokens": 218376661.0, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.11319783895034731, |
| "grad_norm": 0.20584498345851898, |
| "learning_rate": 1.493150684931507e-05, |
| "loss": 0.8371, |
| "mean_token_accuracy": 0.7601476907730103, |
| "num_tokens": 228779024.0, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.11834319526627218, |
| "grad_norm": 0.22660651803016663, |
| "learning_rate": 1.5616438356164384e-05, |
| "loss": 0.8379, |
| "mean_token_accuracy": 0.7598486542701721, |
| "num_tokens": 239167993.0, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.12348855158219707, |
| "grad_norm": 0.21756519377231598, |
| "learning_rate": 1.6301369863013702e-05, |
| "loss": 0.833, |
| "mean_token_accuracy": 0.7604194134473801, |
| "num_tokens": 249533588.0, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.12863390789812196, |
| "grad_norm": 0.2544439733028412, |
| "learning_rate": 1.6986301369863014e-05, |
| "loss": 0.8267, |
| "mean_token_accuracy": 0.7623803347349167, |
| "num_tokens": 259910590.0, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.13377926421404682, |
| "grad_norm": 0.22863225638866425, |
| "learning_rate": 1.767123287671233e-05, |
| "loss": 0.8326, |
| "mean_token_accuracy": 0.7607358664274215, |
| "num_tokens": 270321780.0, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.1389246205299717, |
| "grad_norm": 0.2177596092224121, |
| "learning_rate": 1.8356164383561645e-05, |
| "loss": 0.8181, |
| "mean_token_accuracy": 0.7643768191337585, |
| "num_tokens": 280754879.0, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.1440699768458966, |
| "grad_norm": 0.25882163643836975, |
| "learning_rate": 1.904109589041096e-05, |
| "loss": 0.83, |
| "mean_token_accuracy": 0.7610959112644196, |
| "num_tokens": 291123129.0, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.14921533316182145, |
| "grad_norm": 0.3098185062408447, |
| "learning_rate": 1.9726027397260276e-05, |
| "loss": 0.8421, |
| "mean_token_accuracy": 0.7577149778604507, |
| "num_tokens": 301526988.0, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.15436068947774634, |
| "grad_norm": 0.3141907751560211, |
| "learning_rate": 1.999995658762304e-05, |
| "loss": 0.8239, |
| "mean_token_accuracy": 0.7624665558338165, |
| "num_tokens": 311935967.0, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.15950604579367123, |
| "grad_norm": 0.2806973159313202, |
| "learning_rate": 1.999969129158383e-05, |
| "loss": 0.8149, |
| "mean_token_accuracy": 0.7648573398590088, |
| "num_tokens": 322344265.0, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.16465140210959608, |
| "grad_norm": 0.25377604365348816, |
| "learning_rate": 1.999918482601347e-05, |
| "loss": 0.8115, |
| "mean_token_accuracy": 0.7656003296375274, |
| "num_tokens": 332734169.0, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.16979675842552097, |
| "grad_norm": 0.2188454419374466, |
| "learning_rate": 1.9998437207198492e-05, |
| "loss": 0.8159, |
| "mean_token_accuracy": 0.7638908416032791, |
| "num_tokens": 343149437.0, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.17494211474144586, |
| "grad_norm": 0.284574031829834, |
| "learning_rate": 1.9997448459180285e-05, |
| "loss": 0.8125, |
| "mean_token_accuracy": 0.7646788358688354, |
| "num_tokens": 353540696.0, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.18008747105737072, |
| "grad_norm": 0.2330782562494278, |
| "learning_rate": 1.999621861375427e-05, |
| "loss": 0.8015, |
| "mean_token_accuracy": 0.7678918361663818, |
| "num_tokens": 363936147.0, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.1852328273732956, |
| "grad_norm": 0.3005109429359436, |
| "learning_rate": 1.9994747710468907e-05, |
| "loss": 0.8221, |
| "mean_token_accuracy": 0.7623626977205277, |
| "num_tokens": 374341093.0, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.19037818368922046, |
| "grad_norm": 0.2539622485637665, |
| "learning_rate": 1.9993035796624416e-05, |
| "loss": 0.8191, |
| "mean_token_accuracy": 0.7632038950920105, |
| "num_tokens": 384736305.0, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.19552354000514535, |
| "grad_norm": 0.3602747321128845, |
| "learning_rate": 1.9991082927271263e-05, |
| "loss": 0.8072, |
| "mean_token_accuracy": 0.765991085767746, |
| "num_tokens": 395134979.0, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.20066889632107024, |
| "grad_norm": 0.273478627204895, |
| "learning_rate": 1.9988889165208373e-05, |
| "loss": 0.8078, |
| "mean_token_accuracy": 0.7652726262807846, |
| "num_tokens": 405523607.0, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.2058142526369951, |
| "grad_norm": 0.2960168719291687, |
| "learning_rate": 1.998645458098112e-05, |
| "loss": 0.813, |
| "mean_token_accuracy": 0.7640766650438309, |
| "num_tokens": 415905850.0, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.21095960895291999, |
| "grad_norm": 0.23220957815647125, |
| "learning_rate": 1.998377925287908e-05, |
| "loss": 0.8086, |
| "mean_token_accuracy": 0.7652794599533081, |
| "num_tokens": 426274540.0, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.21610496526884487, |
| "grad_norm": 0.28992024064064026, |
| "learning_rate": 1.9980863266933464e-05, |
| "loss": 0.8179, |
| "mean_token_accuracy": 0.7626183509826661, |
| "num_tokens": 436689729.0, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.22125032158476973, |
| "grad_norm": 0.31990498304367065, |
| "learning_rate": 1.9977706716914402e-05, |
| "loss": 0.803, |
| "mean_token_accuracy": 0.7667577922344208, |
| "num_tokens": 447114386.0, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.22639567790069462, |
| "grad_norm": 0.3343593180179596, |
| "learning_rate": 1.997430970432789e-05, |
| "loss": 0.7933, |
| "mean_token_accuracy": 0.7693847328424454, |
| "num_tokens": 457532029.0, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.2315410342166195, |
| "grad_norm": 0.24708902835845947, |
| "learning_rate": 1.9970672338412554e-05, |
| "loss": 0.7975, |
| "mean_token_accuracy": 0.768178117275238, |
| "num_tokens": 467953269.0, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.23668639053254437, |
| "grad_norm": 0.33174219727516174, |
| "learning_rate": 1.9966794736136114e-05, |
| "loss": 0.8089, |
| "mean_token_accuracy": 0.7650195062160492, |
| "num_tokens": 478360587.0, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.24183174684846925, |
| "grad_norm": 0.2640225291252136, |
| "learning_rate": 1.9962677022191648e-05, |
| "loss": 0.7919, |
| "mean_token_accuracy": 0.7692882120609283, |
| "num_tokens": 488764842.0, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.24697710316439414, |
| "grad_norm": 0.2795109450817108, |
| "learning_rate": 1.9958319328993553e-05, |
| "loss": 0.8134, |
| "mean_token_accuracy": 0.7633749455213547, |
| "num_tokens": 499181820.0, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.25212245948031903, |
| "grad_norm": 0.220581516623497, |
| "learning_rate": 1.99537217966733e-05, |
| "loss": 0.8017, |
| "mean_token_accuracy": 0.7669906347990036, |
| "num_tokens": 509595885.0, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.2572678157962439, |
| "grad_norm": 0.2826674282550812, |
| "learning_rate": 1.9948884573074948e-05, |
| "loss": 0.8109, |
| "mean_token_accuracy": 0.7639420449733734, |
| "num_tokens": 519993750.0, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.26241317211216875, |
| "grad_norm": 0.24169522523880005, |
| "learning_rate": 1.9943807813750356e-05, |
| "loss": 0.7981, |
| "mean_token_accuracy": 0.7674963772296906, |
| "num_tokens": 530411077.0, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.26755852842809363, |
| "grad_norm": 0.2998713552951813, |
| "learning_rate": 1.9938491681954196e-05, |
| "loss": 0.7992, |
| "mean_token_accuracy": 0.7669602394104004, |
| "num_tokens": 540817105.0, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.2727038847440185, |
| "grad_norm": 0.3029504716396332, |
| "learning_rate": 1.993293634863871e-05, |
| "loss": 0.7932, |
| "mean_token_accuracy": 0.7686622679233551, |
| "num_tokens": 551223833.0, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.2778492410599434, |
| "grad_norm": 0.263815701007843, |
| "learning_rate": 1.99271419924482e-05, |
| "loss": 0.7941, |
| "mean_token_accuracy": 0.7685212969779969, |
| "num_tokens": 561643697.0, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.2829945973758683, |
| "grad_norm": 0.3639836311340332, |
| "learning_rate": 1.992110879971329e-05, |
| "loss": 0.7973, |
| "mean_token_accuracy": 0.7674791067838669, |
| "num_tokens": 572054629.0, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.2881399536917932, |
| "grad_norm": 0.2445714920759201, |
| "learning_rate": 1.9914836964444934e-05, |
| "loss": 0.804, |
| "mean_token_accuracy": 0.7654124438762665, |
| "num_tokens": 582446471.0, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.293285310007718, |
| "grad_norm": 0.23441371321678162, |
| "learning_rate": 1.990832668832818e-05, |
| "loss": 0.7832, |
| "mean_token_accuracy": 0.7708245635032653, |
| "num_tokens": 592850162.0, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.2984306663236429, |
| "grad_norm": 0.27346476912498474, |
| "learning_rate": 1.9901578180715674e-05, |
| "loss": 0.7892, |
| "mean_token_accuracy": 0.7693257629871368, |
| "num_tokens": 603254165.0, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.3035760226395678, |
| "grad_norm": 0.2540152966976166, |
| "learning_rate": 1.989459165862094e-05, |
| "loss": 0.7873, |
| "mean_token_accuracy": 0.7698075413703919, |
| "num_tokens": 613613424.0, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.3087213789554927, |
| "grad_norm": 0.24472405016422272, |
| "learning_rate": 1.9887367346711387e-05, |
| "loss": 0.7903, |
| "mean_token_accuracy": 0.7687265604734421, |
| "num_tokens": 624004700.0, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.31386673527141756, |
| "grad_norm": 0.2586674094200134, |
| "learning_rate": 1.987990547730111e-05, |
| "loss": 0.7843, |
| "mean_token_accuracy": 0.7710338205099105, |
| "num_tokens": 634387240.0, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.31901209158734245, |
| "grad_norm": 0.23755744099617004, |
| "learning_rate": 1.9872206290343384e-05, |
| "loss": 0.7865, |
| "mean_token_accuracy": 0.7694753050804138, |
| "num_tokens": 644778090.0, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.3241574479032673, |
| "grad_norm": 0.3347889482975006, |
| "learning_rate": 1.9864270033422975e-05, |
| "loss": 0.7964, |
| "mean_token_accuracy": 0.7670842260122299, |
| "num_tokens": 655177170.0, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.32930280421919217, |
| "grad_norm": 0.2553674578666687, |
| "learning_rate": 1.985609696174817e-05, |
| "loss": 0.7909, |
| "mean_token_accuracy": 0.7687036842107773, |
| "num_tokens": 665585375.0, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.33444816053511706, |
| "grad_norm": 0.22028060257434845, |
| "learning_rate": 1.984768733814257e-05, |
| "loss": 0.7835, |
| "mean_token_accuracy": 0.7702562361955643, |
| "num_tokens": 675996436.0, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.33959351685104194, |
| "grad_norm": 0.24218259751796722, |
| "learning_rate": 1.9839041433036636e-05, |
| "loss": 0.7882, |
| "mean_token_accuracy": 0.7690007984638214, |
| "num_tokens": 686407890.0, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.34473887316696683, |
| "grad_norm": 0.2648875117301941, |
| "learning_rate": 1.9830159524459e-05, |
| "loss": 0.7914, |
| "mean_token_accuracy": 0.7680494576692581, |
| "num_tokens": 696772298.0, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.3498842294828917, |
| "grad_norm": 0.3156125247478485, |
| "learning_rate": 1.982104189802751e-05, |
| "loss": 0.7882, |
| "mean_token_accuracy": 0.7689610362052918, |
| "num_tokens": 707159995.0, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.35502958579881655, |
| "grad_norm": 0.21199235320091248, |
| "learning_rate": 1.9811688846940064e-05, |
| "loss": 0.7973, |
| "mean_token_accuracy": 0.7667856603860855, |
| "num_tokens": 717555566.0, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.36017494211474144, |
| "grad_norm": 0.2736167013645172, |
| "learning_rate": 1.9802100671965167e-05, |
| "loss": 0.7927, |
| "mean_token_accuracy": 0.767885434627533, |
| "num_tokens": 727966808.0, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.3653202984306663, |
| "grad_norm": 0.22648128867149353, |
| "learning_rate": 1.9792277681432257e-05, |
| "loss": 0.7907, |
| "mean_token_accuracy": 0.7685991823673248, |
| "num_tokens": 738371625.0, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.3704656547465912, |
| "grad_norm": 0.26248160004615784, |
| "learning_rate": 1.9782220191221818e-05, |
| "loss": 0.7972, |
| "mean_token_accuracy": 0.7668163865804672, |
| "num_tokens": 748771291.0, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.3756110110625161, |
| "grad_norm": 0.23813173174858093, |
| "learning_rate": 1.9771928524755182e-05, |
| "loss": 0.7871, |
| "mean_token_accuracy": 0.7691414833068848, |
| "num_tokens": 759119097.0, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.38075636737844093, |
| "grad_norm": 0.32292458415031433, |
| "learning_rate": 1.976140301298416e-05, |
| "loss": 0.7858, |
| "mean_token_accuracy": 0.7693786770105362, |
| "num_tokens": 769547626.0, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.3859017236943658, |
| "grad_norm": 0.23817549645900726, |
| "learning_rate": 1.9750643994380377e-05, |
| "loss": 0.7841, |
| "mean_token_accuracy": 0.7698348790407181, |
| "num_tokens": 779970748.0, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.3910470800102907, |
| "grad_norm": 0.2122715711593628, |
| "learning_rate": 1.9739651814924404e-05, |
| "loss": 0.79, |
| "mean_token_accuracy": 0.76830395758152, |
| "num_tokens": 790354749.0, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.3961924363262156, |
| "grad_norm": 0.25851792097091675, |
| "learning_rate": 1.972842682809463e-05, |
| "loss": 0.7844, |
| "mean_token_accuracy": 0.7702566295862198, |
| "num_tokens": 800785265.0, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.4013377926421405, |
| "grad_norm": 0.24672254920005798, |
| "learning_rate": 1.9716969394855884e-05, |
| "loss": 0.7768, |
| "mean_token_accuracy": 0.7723254442214966, |
| "num_tokens": 811181165.0, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.40648314895806537, |
| "grad_norm": 0.23226742446422577, |
| "learning_rate": 1.9705279883647842e-05, |
| "loss": 0.7809, |
| "mean_token_accuracy": 0.7711203783750534, |
| "num_tokens": 821562722.0, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.4116285052739902, |
| "grad_norm": 0.22964943945407867, |
| "learning_rate": 1.9693358670373162e-05, |
| "loss": 0.7772, |
| "mean_token_accuracy": 0.7718722522258759, |
| "num_tokens": 831975336.0, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.4167738615899151, |
| "grad_norm": 0.2204572707414627, |
| "learning_rate": 1.9681206138385418e-05, |
| "loss": 0.7829, |
| "mean_token_accuracy": 0.7703472405672074, |
| "num_tokens": 842370514.0, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.42191921790583997, |
| "grad_norm": 0.21690765023231506, |
| "learning_rate": 1.966882267847675e-05, |
| "loss": 0.7778, |
| "mean_token_accuracy": 0.7715538173913956, |
| "num_tokens": 852764466.0, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.42706457422176486, |
| "grad_norm": 0.2913786768913269, |
| "learning_rate": 1.9656208688865318e-05, |
| "loss": 0.7806, |
| "mean_token_accuracy": 0.7708995878696442, |
| "num_tokens": 863181464.0, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.43220993053768975, |
| "grad_norm": 0.2708909809589386, |
| "learning_rate": 1.9643364575182474e-05, |
| "loss": 0.7853, |
| "mean_token_accuracy": 0.7692730128765106, |
| "num_tokens": 873548841.0, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.43735528685361463, |
| "grad_norm": 0.20992125570774078, |
| "learning_rate": 1.9630290750459733e-05, |
| "loss": 0.7835, |
| "mean_token_accuracy": 0.7699774980545044, |
| "num_tokens": 883941994.0, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.44250064316953946, |
| "grad_norm": 0.21337294578552246, |
| "learning_rate": 1.9616987635115502e-05, |
| "loss": 0.7725, |
| "mean_token_accuracy": 0.7728747427463531, |
| "num_tokens": 894335136.0, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.44764599948546435, |
| "grad_norm": 0.30405837297439575, |
| "learning_rate": 1.9603455656941518e-05, |
| "loss": 0.7813, |
| "mean_token_accuracy": 0.7705006301403046, |
| "num_tokens": 904737119.0, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.45279135580138924, |
| "grad_norm": 0.22999157011508942, |
| "learning_rate": 1.9589695251089154e-05, |
| "loss": 0.7804, |
| "mean_token_accuracy": 0.7707367807626724, |
| "num_tokens": 915122419.0, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.4579367121173141, |
| "grad_norm": 0.37796908617019653, |
| "learning_rate": 1.9575706860055363e-05, |
| "loss": 0.7859, |
| "mean_token_accuracy": 0.7693438589572906, |
| "num_tokens": 925494224.0, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.463082068433239, |
| "grad_norm": 0.4048900306224823, |
| "learning_rate": 1.9561490933668492e-05, |
| "loss": 0.79, |
| "mean_token_accuracy": 0.7678173094987869, |
| "num_tokens": 935892275.0, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.4682274247491639, |
| "grad_norm": 0.2899525463581085, |
| "learning_rate": 1.95470479290738e-05, |
| "loss": 0.773, |
| "mean_token_accuracy": 0.7723690897226334, |
| "num_tokens": 946289615.0, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.47337278106508873, |
| "grad_norm": 0.22180290520191193, |
| "learning_rate": 1.9532378310718745e-05, |
| "loss": 0.7721, |
| "mean_token_accuracy": 0.7729926645755768, |
| "num_tokens": 956672790.0, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.4785181373810136, |
| "grad_norm": 0.32568320631980896, |
| "learning_rate": 1.951748255033809e-05, |
| "loss": 0.7845, |
| "mean_token_accuracy": 0.7696036785840988, |
| "num_tokens": 967100741.0, |
| "step": 465 |
| }, |
| { |
| "epoch": 0.4836634936969385, |
| "grad_norm": 0.21786752343177795, |
| "learning_rate": 1.9502361126938683e-05, |
| "loss": 0.7769, |
| "mean_token_accuracy": 0.7711740046739578, |
| "num_tokens": 977497458.0, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.4888088500128634, |
| "grad_norm": 0.274349570274353, |
| "learning_rate": 1.9487014526784088e-05, |
| "loss": 0.7717, |
| "mean_token_accuracy": 0.772866228222847, |
| "num_tokens": 987914143.0, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.4939542063287883, |
| "grad_norm": 0.27368083596229553, |
| "learning_rate": 1.9471443243378934e-05, |
| "loss": 0.7812, |
| "mean_token_accuracy": 0.7704960852861404, |
| "num_tokens": 998321473.0, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.49909956264471317, |
| "grad_norm": 0.2256074994802475, |
| "learning_rate": 1.9455647777453045e-05, |
| "loss": 0.7819, |
| "mean_token_accuracy": 0.7698213875293731, |
| "num_tokens": 1008748099.0, |
| "step": 485 |
| }, |
| { |
| "epoch": 0.5042449189606381, |
| "grad_norm": 0.2857572138309479, |
| "learning_rate": 1.9439628636945337e-05, |
| "loss": 0.7816, |
| "mean_token_accuracy": 0.7701322674751282, |
| "num_tokens": 1019143713.0, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.5093902752765629, |
| "grad_norm": 1.1966010332107544, |
| "learning_rate": 1.9423386336987507e-05, |
| "loss": 0.7727, |
| "mean_token_accuracy": 0.7724141061306, |
| "num_tokens": 1029507845.0, |
| "step": 495 |
| }, |
| { |
| "epoch": 0.5145356315924878, |
| "grad_norm": 0.40811723470687866, |
| "learning_rate": 1.9406921399887432e-05, |
| "loss": 0.8462, |
| "mean_token_accuracy": 0.7615606904029846, |
| "num_tokens": 1039924539.0, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.5196809879084127, |
| "grad_norm": 0.43425825238227844, |
| "learning_rate": 1.9390234355112386e-05, |
| "loss": 0.776, |
| "mean_token_accuracy": 0.7718711495399475, |
| "num_tokens": 1050307833.0, |
| "step": 505 |
| }, |
| { |
| "epoch": 0.5248263442243375, |
| "grad_norm": 0.2550142705440521, |
| "learning_rate": 1.9373325739272035e-05, |
| "loss": 0.7718, |
| "mean_token_accuracy": 0.772559967637062, |
| "num_tokens": 1060725805.0, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.5299717005402624, |
| "grad_norm": 0.2491987943649292, |
| "learning_rate": 1.9356196096101145e-05, |
| "loss": 0.7818, |
| "mean_token_accuracy": 0.7696256130933762, |
| "num_tokens": 1071091088.0, |
| "step": 515 |
| }, |
| { |
| "epoch": 0.5351170568561873, |
| "grad_norm": 0.2185741513967514, |
| "learning_rate": 1.9338845976442128e-05, |
| "loss": 0.7696, |
| "mean_token_accuracy": 0.7729953050613403, |
| "num_tokens": 1081507707.0, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.5402624131721122, |
| "grad_norm": 0.22735682129859924, |
| "learning_rate": 1.9321275938227315e-05, |
| "loss": 0.7846, |
| "mean_token_accuracy": 0.7691415637731552, |
| "num_tokens": 1091915702.0, |
| "step": 525 |
| }, |
| { |
| "epoch": 0.545407769488037, |
| "grad_norm": 0.2260189801454544, |
| "learning_rate": 1.930348654646101e-05, |
| "loss": 0.7783, |
| "mean_token_accuracy": 0.7702914178371429, |
| "num_tokens": 1102329794.0, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.5505531258039619, |
| "grad_norm": 0.24121206998825073, |
| "learning_rate": 1.928547837320133e-05, |
| "loss": 0.7793, |
| "mean_token_accuracy": 0.7705663651227951, |
| "num_tokens": 1112744574.0, |
| "step": 535 |
| }, |
| { |
| "epoch": 0.5556984821198868, |
| "grad_norm": 0.21719065308570862, |
| "learning_rate": 1.92672519975418e-05, |
| "loss": 0.7709, |
| "mean_token_accuracy": 0.7725832790136338, |
| "num_tokens": 1123133981.0, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.5608438384358116, |
| "grad_norm": 0.2318485677242279, |
| "learning_rate": 1.9248808005592748e-05, |
| "loss": 0.7593, |
| "mean_token_accuracy": 0.7762016743421555, |
| "num_tokens": 1133547201.0, |
| "step": 545 |
| }, |
| { |
| "epoch": 0.5659891947517366, |
| "grad_norm": 0.22322027385234833, |
| "learning_rate": 1.923014699046244e-05, |
| "loss": 0.7781, |
| "mean_token_accuracy": 0.7703386157751083, |
| "num_tokens": 1143928590.0, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.5711345510676614, |
| "grad_norm": 0.25323280692100525, |
| "learning_rate": 1.9211269552238006e-05, |
| "loss": 0.7674, |
| "mean_token_accuracy": 0.7730859339237213, |
| "num_tokens": 1154289231.0, |
| "step": 555 |
| }, |
| { |
| "epoch": 0.5762799073835864, |
| "grad_norm": 0.19904294610023499, |
| "learning_rate": 1.919217629796616e-05, |
| "loss": 0.7562, |
| "mean_token_accuracy": 0.7763365268707275, |
| "num_tokens": 1164696083.0, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.5814252636995112, |
| "grad_norm": 0.20939859747886658, |
| "learning_rate": 1.917286784163366e-05, |
| "loss": 0.7619, |
| "mean_token_accuracy": 0.7748125195503235, |
| "num_tokens": 1175092473.0, |
| "step": 565 |
| }, |
| { |
| "epoch": 0.586570620015436, |
| "grad_norm": 0.19839182496070862, |
| "learning_rate": 1.9153344804147583e-05, |
| "loss": 0.758, |
| "mean_token_accuracy": 0.7757725417613983, |
| "num_tokens": 1185494160.0, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.591715976331361, |
| "grad_norm": 0.2394329458475113, |
| "learning_rate": 1.913360781331535e-05, |
| "loss": 0.7761, |
| "mean_token_accuracy": 0.770972666144371, |
| "num_tokens": 1195885099.0, |
| "step": 575 |
| }, |
| { |
| "epoch": 0.5968613326472858, |
| "grad_norm": 0.22963404655456543, |
| "learning_rate": 1.9113657503824513e-05, |
| "loss": 0.7632, |
| "mean_token_accuracy": 0.7746537119150162, |
| "num_tokens": 1206298358.0, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.6020066889632107, |
| "grad_norm": 0.2229454070329666, |
| "learning_rate": 1.9093494517222397e-05, |
| "loss": 0.7843, |
| "mean_token_accuracy": 0.7687952756881714, |
| "num_tokens": 1216694302.0, |
| "step": 585 |
| }, |
| { |
| "epoch": 0.6071520452791356, |
| "grad_norm": 0.24715222418308258, |
| "learning_rate": 1.907311950189542e-05, |
| "loss": 0.7782, |
| "mean_token_accuracy": 0.770742443203926, |
| "num_tokens": 1227102730.0, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.6122974015950604, |
| "grad_norm": 0.26584967970848083, |
| "learning_rate": 1.9052533113048274e-05, |
| "loss": 0.7656, |
| "mean_token_accuracy": 0.7737416863441468, |
| "num_tokens": 1237504190.0, |
| "step": 595 |
| }, |
| { |
| "epoch": 0.6174427579109854, |
| "grad_norm": 0.23727792501449585, |
| "learning_rate": 1.903173601268284e-05, |
| "loss": 0.767, |
| "mean_token_accuracy": 0.7734443098306656, |
| "num_tokens": 1247914802.0, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.6225881142269102, |
| "grad_norm": 0.22145575284957886, |
| "learning_rate": 1.90107288695769e-05, |
| "loss": 0.7717, |
| "mean_token_accuracy": 0.7720071583986282, |
| "num_tokens": 1258346476.0, |
| "step": 605 |
| }, |
| { |
| "epoch": 0.6277334705428351, |
| "grad_norm": 0.2083977609872818, |
| "learning_rate": 1.8989512359262643e-05, |
| "loss": 0.7679, |
| "mean_token_accuracy": 0.7729957222938537, |
| "num_tokens": 1268749806.0, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.63287882685876, |
| "grad_norm": 0.21598635613918304, |
| "learning_rate": 1.8968087164004935e-05, |
| "loss": 0.7662, |
| "mean_token_accuracy": 0.7731610238552094, |
| "num_tokens": 1279117698.0, |
| "step": 615 |
| }, |
| { |
| "epoch": 0.6380241831746849, |
| "grad_norm": 0.19322967529296875, |
| "learning_rate": 1.894645397277937e-05, |
| "loss": 0.7502, |
| "mean_token_accuracy": 0.7780898064374924, |
| "num_tokens": 1289533458.0, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.6431695394906097, |
| "grad_norm": 0.19693732261657715, |
| "learning_rate": 1.8924613481250128e-05, |
| "loss": 0.7727, |
| "mean_token_accuracy": 0.7718368798494339, |
| "num_tokens": 1299925979.0, |
| "step": 625 |
| }, |
| { |
| "epoch": 0.6483148958065346, |
| "grad_norm": 0.21000495553016663, |
| "learning_rate": 1.8902566391747596e-05, |
| "loss": 0.7734, |
| "mean_token_accuracy": 0.7717230170965195, |
| "num_tokens": 1310311084.0, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.6534602521224595, |
| "grad_norm": 0.2045949399471283, |
| "learning_rate": 1.8880313413245794e-05, |
| "loss": 0.7717, |
| "mean_token_accuracy": 0.7715155422687531, |
| "num_tokens": 1320686452.0, |
| "step": 635 |
| }, |
| { |
| "epoch": 0.6586056084383843, |
| "grad_norm": 0.22864918410778046, |
| "learning_rate": 1.885785526133956e-05, |
| "loss": 0.7579, |
| "mean_token_accuracy": 0.7755870014429093, |
| "num_tokens": 1331096443.0, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.6637509647543093, |
| "grad_norm": 0.2280014157295227, |
| "learning_rate": 1.8835192658221545e-05, |
| "loss": 0.7643, |
| "mean_token_accuracy": 0.7744101166725159, |
| "num_tokens": 1341514376.0, |
| "step": 645 |
| }, |
| { |
| "epoch": 0.6688963210702341, |
| "grad_norm": 0.224672332406044, |
| "learning_rate": 1.8812326332658997e-05, |
| "loss": 0.7674, |
| "mean_token_accuracy": 0.773426678776741, |
| "num_tokens": 1351882404.0, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.6740416773861589, |
| "grad_norm": 0.2680542767047882, |
| "learning_rate": 1.878925701997032e-05, |
| "loss": 0.7573, |
| "mean_token_accuracy": 0.7760124772787094, |
| "num_tokens": 1362304069.0, |
| "step": 655 |
| }, |
| { |
| "epoch": 0.6791870337020839, |
| "grad_norm": 0.29469063878059387, |
| "learning_rate": 1.8765985462001424e-05, |
| "loss": 0.7636, |
| "mean_token_accuracy": 0.7740887552499771, |
| "num_tokens": 1372688301.0, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.6843323900180087, |
| "grad_norm": 0.21097056567668915, |
| "learning_rate": 1.8742512407101875e-05, |
| "loss": 0.7578, |
| "mean_token_accuracy": 0.7753616005182267, |
| "num_tokens": 1383093337.0, |
| "step": 665 |
| }, |
| { |
| "epoch": 0.6894777463339337, |
| "grad_norm": 0.21591758728027344, |
| "learning_rate": 1.8718838610100832e-05, |
| "loss": 0.7566, |
| "mean_token_accuracy": 0.7760109454393387, |
| "num_tokens": 1393440159.0, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.6946231026498585, |
| "grad_norm": 0.2419527769088745, |
| "learning_rate": 1.8694964832282764e-05, |
| "loss": 0.77, |
| "mean_token_accuracy": 0.7725261867046356, |
| "num_tokens": 1403836556.0, |
| "step": 675 |
| }, |
| { |
| "epoch": 0.6997684589657834, |
| "grad_norm": 0.27528461813926697, |
| "learning_rate": 1.8670891841362976e-05, |
| "loss": 0.7543, |
| "mean_token_accuracy": 0.7764055013656617, |
| "num_tokens": 1414213366.0, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.7049138152817083, |
| "grad_norm": 0.2578127980232239, |
| "learning_rate": 1.8646620411462924e-05, |
| "loss": 0.7802, |
| "mean_token_accuracy": 0.7694523215293885, |
| "num_tokens": 1424633658.0, |
| "step": 685 |
| }, |
| { |
| "epoch": 0.7100591715976331, |
| "grad_norm": 0.23547138273715973, |
| "learning_rate": 1.8622151323085317e-05, |
| "loss": 0.7573, |
| "mean_token_accuracy": 0.7754601895809173, |
| "num_tokens": 1435040726.0, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.715204527913558, |
| "grad_norm": 0.28163185715675354, |
| "learning_rate": 1.8597485363089026e-05, |
| "loss": 0.7574, |
| "mean_token_accuracy": 0.7757174968719482, |
| "num_tokens": 1445446395.0, |
| "step": 695 |
| }, |
| { |
| "epoch": 0.7203498842294829, |
| "grad_norm": 0.2508062422275543, |
| "learning_rate": 1.8572623324663756e-05, |
| "loss": 0.767, |
| "mean_token_accuracy": 0.7730412214994431, |
| "num_tokens": 1455855217.0, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.7254952405454078, |
| "grad_norm": 0.20320753753185272, |
| "learning_rate": 1.8547566007304577e-05, |
| "loss": 0.7687, |
| "mean_token_accuracy": 0.7726406931877137, |
| "num_tokens": 1466245798.0, |
| "step": 705 |
| }, |
| { |
| "epoch": 0.7306405968613326, |
| "grad_norm": 0.2486068159341812, |
| "learning_rate": 1.8522314216786186e-05, |
| "loss": 0.7559, |
| "mean_token_accuracy": 0.77595334649086, |
| "num_tokens": 1476629034.0, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.7357859531772575, |
| "grad_norm": 0.20530639588832855, |
| "learning_rate": 1.8496868765136996e-05, |
| "loss": 0.758, |
| "mean_token_accuracy": 0.7758067876100541, |
| "num_tokens": 1487047420.0, |
| "step": 715 |
| }, |
| { |
| "epoch": 0.7409313094931824, |
| "grad_norm": 0.20521405339241028, |
| "learning_rate": 1.8471230470613046e-05, |
| "loss": 0.7661, |
| "mean_token_accuracy": 0.7729949295520783, |
| "num_tokens": 1497477237.0, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.7460766658091073, |
| "grad_norm": 0.2346915453672409, |
| "learning_rate": 1.844540015767167e-05, |
| "loss": 0.7633, |
| "mean_token_accuracy": 0.7740152984857559, |
| "num_tokens": 1507862470.0, |
| "step": 725 |
| }, |
| { |
| "epoch": 0.7512220221250322, |
| "grad_norm": 0.25663813948631287, |
| "learning_rate": 1.8419378656944983e-05, |
| "loss": 0.7633, |
| "mean_token_accuracy": 0.7739923149347305, |
| "num_tokens": 1518284565.0, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.756367378440957, |
| "grad_norm": 0.19483357667922974, |
| "learning_rate": 1.8393166805213178e-05, |
| "loss": 0.7564, |
| "mean_token_accuracy": 0.7762584149837494, |
| "num_tokens": 1528673299.0, |
| "step": 735 |
| }, |
| { |
| "epoch": 0.7615127347568819, |
| "grad_norm": 0.298998087644577, |
| "learning_rate": 1.8366765445377614e-05, |
| "loss": 0.7634, |
| "mean_token_accuracy": 0.7737476319074631, |
| "num_tokens": 1539078416.0, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.7666580910728068, |
| "grad_norm": 0.22814105451107025, |
| "learning_rate": 1.834017542643372e-05, |
| "loss": 0.769, |
| "mean_token_accuracy": 0.7724894404411315, |
| "num_tokens": 1549479202.0, |
| "step": 745 |
| }, |
| { |
| "epoch": 0.7718034473887316, |
| "grad_norm": 0.20318229496479034, |
| "learning_rate": 1.8313397603443665e-05, |
| "loss": 0.7508, |
| "mean_token_accuracy": 0.7772645950317383, |
| "num_tokens": 1559862298.0, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.7769488037046566, |
| "grad_norm": 0.28188684582710266, |
| "learning_rate": 1.828643283750891e-05, |
| "loss": 0.7518, |
| "mean_token_accuracy": 0.776808711886406, |
| "num_tokens": 1570288616.0, |
| "step": 755 |
| }, |
| { |
| "epoch": 0.7820941600205814, |
| "grad_norm": 0.24467986822128296, |
| "learning_rate": 1.8259281995742467e-05, |
| "loss": 0.7663, |
| "mean_token_accuracy": 0.7730655431747436, |
| "num_tokens": 1580705251.0, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.7872395163365064, |
| "grad_norm": 0.24917523562908173, |
| "learning_rate": 1.8231945951241043e-05, |
| "loss": 0.7625, |
| "mean_token_accuracy": 0.773466631770134, |
| "num_tokens": 1591106421.0, |
| "step": 765 |
| }, |
| { |
| "epoch": 0.7923848726524312, |
| "grad_norm": 0.18945878744125366, |
| "learning_rate": 1.8204425583056962e-05, |
| "loss": 0.7507, |
| "mean_token_accuracy": 0.7773733377456665, |
| "num_tokens": 1601520172.0, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.797530228968356, |
| "grad_norm": 0.2917018234729767, |
| "learning_rate": 1.817672177616989e-05, |
| "loss": 0.7604, |
| "mean_token_accuracy": 0.7747350037097931, |
| "num_tokens": 1611899934.0, |
| "step": 775 |
| }, |
| { |
| "epoch": 0.802675585284281, |
| "grad_norm": 0.24668017029762268, |
| "learning_rate": 1.8148835421458374e-05, |
| "loss": 0.7601, |
| "mean_token_accuracy": 0.7746721476316452, |
| "num_tokens": 1622304284.0, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.8078209416002058, |
| "grad_norm": 0.3010805547237396, |
| "learning_rate": 1.8120767415671208e-05, |
| "loss": 0.7715, |
| "mean_token_accuracy": 0.7717162489891052, |
| "num_tokens": 1632665638.0, |
| "step": 785 |
| }, |
| { |
| "epoch": 0.8129662979161307, |
| "grad_norm": 0.2387019842863083, |
| "learning_rate": 1.809251866139858e-05, |
| "loss": 0.7631, |
| "mean_token_accuracy": 0.773768350481987, |
| "num_tokens": 1643065667.0, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.8181116542320556, |
| "grad_norm": 0.207286074757576, |
| "learning_rate": 1.8064090067043066e-05, |
| "loss": 0.7596, |
| "mean_token_accuracy": 0.7746483981609344, |
| "num_tokens": 1653489023.0, |
| "step": 795 |
| }, |
| { |
| "epoch": 0.8232570105479804, |
| "grad_norm": 0.20019465684890747, |
| "learning_rate": 1.8035482546790387e-05, |
| "loss": 0.7619, |
| "mean_token_accuracy": 0.7740916252136231, |
| "num_tokens": 1663916269.0, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.8284023668639053, |
| "grad_norm": 0.24347372353076935, |
| "learning_rate": 1.8006697020580048e-05, |
| "loss": 0.7614, |
| "mean_token_accuracy": 0.7742874711751938, |
| "num_tokens": 1674339518.0, |
| "step": 805 |
| }, |
| { |
| "epoch": 0.8335477231798302, |
| "grad_norm": 0.2331140786409378, |
| "learning_rate": 1.7977734414075728e-05, |
| "loss": 0.7542, |
| "mean_token_accuracy": 0.7760845631361007, |
| "num_tokens": 1684753767.0, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.8386930794957551, |
| "grad_norm": 0.26996636390686035, |
| "learning_rate": 1.7948595658635533e-05, |
| "loss": 0.7644, |
| "mean_token_accuracy": 0.7731517612934112, |
| "num_tokens": 1695167774.0, |
| "step": 815 |
| }, |
| { |
| "epoch": 0.8438384358116799, |
| "grad_norm": 0.23208890855312347, |
| "learning_rate": 1.791928169128202e-05, |
| "loss": 0.754, |
| "mean_token_accuracy": 0.7757177084684372, |
| "num_tokens": 1705535908.0, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.8489837921276049, |
| "grad_norm": 0.22476951777935028, |
| "learning_rate": 1.7889793454672104e-05, |
| "loss": 0.757, |
| "mean_token_accuracy": 0.7750364452600479, |
| "num_tokens": 1715945968.0, |
| "step": 825 |
| }, |
| { |
| "epoch": 0.8541291484435297, |
| "grad_norm": 0.2300329953432083, |
| "learning_rate": 1.7860131897066702e-05, |
| "loss": 0.7687, |
| "mean_token_accuracy": 0.7721786022186279, |
| "num_tokens": 1726366494.0, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.8592745047594545, |
| "grad_norm": 0.21463564038276672, |
| "learning_rate": 1.7830297972300266e-05, |
| "loss": 0.7632, |
| "mean_token_accuracy": 0.773630577325821, |
| "num_tokens": 1736754760.0, |
| "step": 835 |
| }, |
| { |
| "epoch": 0.8644198610753795, |
| "grad_norm": 0.20610825717449188, |
| "learning_rate": 1.780029263975011e-05, |
| "loss": 0.7428, |
| "mean_token_accuracy": 0.7793777525424957, |
| "num_tokens": 1747169397.0, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.8695652173913043, |
| "grad_norm": 0.17724700272083282, |
| "learning_rate": 1.7770116864305543e-05, |
| "loss": 0.7542, |
| "mean_token_accuracy": 0.7759394317865371, |
| "num_tokens": 1757562579.0, |
| "step": 845 |
| }, |
| { |
| "epoch": 0.8747105737072293, |
| "grad_norm": 0.20515885949134827, |
| "learning_rate": 1.773977161633686e-05, |
| "loss": 0.7585, |
| "mean_token_accuracy": 0.774996566772461, |
| "num_tokens": 1767951452.0, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.8798559300231541, |
| "grad_norm": 0.21575596928596497, |
| "learning_rate": 1.770925787166412e-05, |
| "loss": 0.7352, |
| "mean_token_accuracy": 0.7808972954750061, |
| "num_tokens": 1778357806.0, |
| "step": 855 |
| }, |
| { |
| "epoch": 0.8850012863390789, |
| "grad_norm": 0.22368930280208588, |
| "learning_rate": 1.767857661152578e-05, |
| "loss": 0.7557, |
| "mean_token_accuracy": 0.775502935051918, |
| "num_tokens": 1788767633.0, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.8901466426550039, |
| "grad_norm": 0.2021559774875641, |
| "learning_rate": 1.7647728822547126e-05, |
| "loss": 0.7609, |
| "mean_token_accuracy": 0.7740152478218079, |
| "num_tokens": 1799164623.0, |
| "step": 865 |
| }, |
| { |
| "epoch": 0.8952919989709287, |
| "grad_norm": 0.19145594537258148, |
| "learning_rate": 1.7616715496708575e-05, |
| "loss": 0.7562, |
| "mean_token_accuracy": 0.7754783451557159, |
| "num_tokens": 1809537031.0, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.9004373552868536, |
| "grad_norm": 0.19838295876979828, |
| "learning_rate": 1.7585537631313738e-05, |
| "loss": 0.7554, |
| "mean_token_accuracy": 0.7755459159612655, |
| "num_tokens": 1819917273.0, |
| "step": 875 |
| }, |
| { |
| "epoch": 0.9055827116027785, |
| "grad_norm": 0.18983587622642517, |
| "learning_rate": 1.7554196228957374e-05, |
| "loss": 0.7629, |
| "mean_token_accuracy": 0.7733017027378082, |
| "num_tokens": 1830301370.0, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.9107280679187034, |
| "grad_norm": 0.2382967472076416, |
| "learning_rate": 1.7522692297493145e-05, |
| "loss": 0.7545, |
| "mean_token_accuracy": 0.775650081038475, |
| "num_tokens": 1840697377.0, |
| "step": 885 |
| }, |
| { |
| "epoch": 0.9158734242346283, |
| "grad_norm": 0.2184479981660843, |
| "learning_rate": 1.7491026850001195e-05, |
| "loss": 0.761, |
| "mean_token_accuracy": 0.7739459365606308, |
| "num_tokens": 1851095319.0, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.9210187805505531, |
| "grad_norm": 0.20179307460784912, |
| "learning_rate": 1.745920090475559e-05, |
| "loss": 0.7654, |
| "mean_token_accuracy": 0.7729564756155014, |
| "num_tokens": 1861483986.0, |
| "step": 895 |
| }, |
| { |
| "epoch": 0.926164136866478, |
| "grad_norm": 0.2453368604183197, |
| "learning_rate": 1.7427215485191567e-05, |
| "loss": 0.7555, |
| "mean_token_accuracy": 0.7756044954061508, |
| "num_tokens": 1871857137.0, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.9313094931824029, |
| "grad_norm": 0.21209710836410522, |
| "learning_rate": 1.739507161987261e-05, |
| "loss": 0.7615, |
| "mean_token_accuracy": 0.7739556908607483, |
| "num_tokens": 1882273427.0, |
| "step": 905 |
| }, |
| { |
| "epoch": 0.9364548494983278, |
| "grad_norm": 0.18556725978851318, |
| "learning_rate": 1.736277034245739e-05, |
| "loss": 0.7557, |
| "mean_token_accuracy": 0.7750442743301391, |
| "num_tokens": 1892660604.0, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.9416002058142526, |
| "grad_norm": 0.2182847410440445, |
| "learning_rate": 1.7330312691666517e-05, |
| "loss": 0.7592, |
| "mean_token_accuracy": 0.7745744317770005, |
| "num_tokens": 1903083350.0, |
| "step": 915 |
| }, |
| { |
| "epoch": 0.9467455621301775, |
| "grad_norm": 0.20762157440185547, |
| "learning_rate": 1.7297699711249144e-05, |
| "loss": 0.7481, |
| "mean_token_accuracy": 0.7772045373916626, |
| "num_tokens": 1913484635.0, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.9518909184461024, |
| "grad_norm": 0.22431345283985138, |
| "learning_rate": 1.7264932449949403e-05, |
| "loss": 0.752, |
| "mean_token_accuracy": 0.7767430722713471, |
| "num_tokens": 1923902157.0, |
| "step": 925 |
| }, |
| { |
| "epoch": 0.9570362747620272, |
| "grad_norm": 0.21677188575267792, |
| "learning_rate": 1.7232011961472666e-05, |
| "loss": 0.751, |
| "mean_token_accuracy": 0.776458004117012, |
| "num_tokens": 1934287093.0, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.9621816310779522, |
| "grad_norm": 0.24291808903217316, |
| "learning_rate": 1.7198939304451677e-05, |
| "loss": 0.7621, |
| "mean_token_accuracy": 0.7737038463354111, |
| "num_tokens": 1944670753.0, |
| "step": 935 |
| }, |
| { |
| "epoch": 0.967326987393877, |
| "grad_norm": 0.18869860470294952, |
| "learning_rate": 1.7165715542412505e-05, |
| "loss": 0.7474, |
| "mean_token_accuracy": 0.7775454163551331, |
| "num_tokens": 1955041733.0, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.972472343709802, |
| "grad_norm": 0.2036639004945755, |
| "learning_rate": 1.7132341743740343e-05, |
| "loss": 0.7591, |
| "mean_token_accuracy": 0.7744521021842956, |
| "num_tokens": 1965413435.0, |
| "step": 945 |
| }, |
| { |
| "epoch": 0.9776177000257268, |
| "grad_norm": 0.19717492163181305, |
| "learning_rate": 1.709881898164515e-05, |
| "loss": 0.7626, |
| "mean_token_accuracy": 0.7734819889068604, |
| "num_tokens": 1975831716.0, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.9827630563416516, |
| "grad_norm": 0.20819610357284546, |
| "learning_rate": 1.7065148334127137e-05, |
| "loss": 0.7622, |
| "mean_token_accuracy": 0.7739403694868088, |
| "num_tokens": 1986221267.0, |
| "step": 955 |
| }, |
| { |
| "epoch": 0.9879084126575766, |
| "grad_norm": 0.2304711937904358, |
| "learning_rate": 1.7031330883942106e-05, |
| "loss": 0.7604, |
| "mean_token_accuracy": 0.7743852972984314, |
| "num_tokens": 1996637419.0, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.9930537689735014, |
| "grad_norm": 0.20585313439369202, |
| "learning_rate": 1.699736771856664e-05, |
| "loss": 0.7417, |
| "mean_token_accuracy": 0.7793391734361649, |
| "num_tokens": 2007024978.0, |
| "step": 965 |
| }, |
| { |
| "epoch": 0.9981991252894263, |
| "grad_norm": 0.20385591685771942, |
| "learning_rate": 1.6963259930163104e-05, |
| "loss": 0.7442, |
| "mean_token_accuracy": 0.7783006697893142, |
| "num_tokens": 2017443954.0, |
| "step": 970 |
| }, |
| { |
| "epoch": 1.003087213789555, |
| "grad_norm": 0.24495287239551544, |
| "learning_rate": 1.692900861554457e-05, |
| "loss": 0.7389, |
| "mean_token_accuracy": 0.779910335415288, |
| "num_tokens": 2027325031.0, |
| "step": 975 |
| }, |
| { |
| "epoch": 1.0082325701054797, |
| "grad_norm": 0.2038143426179886, |
| "learning_rate": 1.68946148761395e-05, |
| "loss": 0.7429, |
| "mean_token_accuracy": 0.7774323493242263, |
| "num_tokens": 2037733216.0, |
| "step": 980 |
| }, |
| { |
| "epoch": 1.0133779264214047, |
| "grad_norm": 0.19667111337184906, |
| "learning_rate": 1.6860079817956353e-05, |
| "loss": 0.7276, |
| "mean_token_accuracy": 0.7817776888608933, |
| "num_tokens": 2048163457.0, |
| "step": 985 |
| }, |
| { |
| "epoch": 1.0185232827373296, |
| "grad_norm": 0.2280578911304474, |
| "learning_rate": 1.682540455154801e-05, |
| "loss": 0.7156, |
| "mean_token_accuracy": 0.7847582966089248, |
| "num_tokens": 2058559958.0, |
| "step": 990 |
| }, |
| { |
| "epoch": 1.0236686390532543, |
| "grad_norm": 0.20857077836990356, |
| "learning_rate": 1.6790590191976068e-05, |
| "loss": 0.7335, |
| "mean_token_accuracy": 0.7803497105836869, |
| "num_tokens": 2068976029.0, |
| "step": 995 |
| }, |
| { |
| "epoch": 1.0288139953691793, |
| "grad_norm": 0.23610693216323853, |
| "learning_rate": 1.6755637858774986e-05, |
| "loss": 0.7416, |
| "mean_token_accuracy": 0.7778135746717453, |
| "num_tokens": 2079378100.0, |
| "step": 1000 |
| }, |
| { |
| "epoch": 1.0339593516851042, |
| "grad_norm": 0.22110521793365479, |
| "learning_rate": 1.6720548675916058e-05, |
| "loss": 0.7354, |
| "mean_token_accuracy": 0.7797701776027679, |
| "num_tokens": 2089780930.0, |
| "step": 1005 |
| }, |
| { |
| "epoch": 1.0391047080010292, |
| "grad_norm": 0.2105809450149536, |
| "learning_rate": 1.6685323771771306e-05, |
| "loss": 0.727, |
| "mean_token_accuracy": 0.7818355768918991, |
| "num_tokens": 2100188750.0, |
| "step": 1010 |
| }, |
| { |
| "epoch": 1.0442500643169539, |
| "grad_norm": 0.20174016058444977, |
| "learning_rate": 1.664996427907717e-05, |
| "loss": 0.7327, |
| "mean_token_accuracy": 0.7801722586154938, |
| "num_tokens": 2110572072.0, |
| "step": 1015 |
| }, |
| { |
| "epoch": 1.0493954206328788, |
| "grad_norm": 0.1835939884185791, |
| "learning_rate": 1.6614471334898086e-05, |
| "loss": 0.7334, |
| "mean_token_accuracy": 0.7800804376602173, |
| "num_tokens": 2120973633.0, |
| "step": 1020 |
| }, |
| { |
| "epoch": 1.0545407769488038, |
| "grad_norm": 0.21300305426120758, |
| "learning_rate": 1.6578846080589934e-05, |
| "loss": 0.7299, |
| "mean_token_accuracy": 0.7812818288803101, |
| "num_tokens": 2131360621.0, |
| "step": 1025 |
| }, |
| { |
| "epoch": 1.0596861332647285, |
| "grad_norm": 0.1908658891916275, |
| "learning_rate": 1.6543089661763315e-05, |
| "loss": 0.7223, |
| "mean_token_accuracy": 0.7834197998046875, |
| "num_tokens": 2141793569.0, |
| "step": 1030 |
| }, |
| { |
| "epoch": 1.0648314895806534, |
| "grad_norm": 0.19039925932884216, |
| "learning_rate": 1.650720322824672e-05, |
| "loss": 0.7361, |
| "mean_token_accuracy": 0.779027310013771, |
| "num_tokens": 2152174814.0, |
| "step": 1035 |
| }, |
| { |
| "epoch": 1.0699768458965784, |
| "grad_norm": 0.2063397467136383, |
| "learning_rate": 1.6471187934049574e-05, |
| "loss": 0.7237, |
| "mean_token_accuracy": 0.7826652109622956, |
| "num_tokens": 2162584314.0, |
| "step": 1040 |
| }, |
| { |
| "epoch": 1.0751222022125033, |
| "grad_norm": 0.20498026907444, |
| "learning_rate": 1.643504493732509e-05, |
| "loss": 0.7346, |
| "mean_token_accuracy": 0.7796628832817077, |
| "num_tokens": 2172965246.0, |
| "step": 1045 |
| }, |
| { |
| "epoch": 1.080267558528428, |
| "grad_norm": 0.23134800791740417, |
| "learning_rate": 1.639877540033305e-05, |
| "loss": 0.7271, |
| "mean_token_accuracy": 0.7816483587026596, |
| "num_tokens": 2183370435.0, |
| "step": 1050 |
| }, |
| { |
| "epoch": 1.085412914844353, |
| "grad_norm": 0.19139111042022705, |
| "learning_rate": 1.6362380489402433e-05, |
| "loss": 0.7228, |
| "mean_token_accuracy": 0.7829385250806808, |
| "num_tokens": 2193756185.0, |
| "step": 1055 |
| }, |
| { |
| "epoch": 1.090558271160278, |
| "grad_norm": 0.23847441375255585, |
| "learning_rate": 1.6325861374893885e-05, |
| "loss": 0.7357, |
| "mean_token_accuracy": 0.7798227697610856, |
| "num_tokens": 2204144787.0, |
| "step": 1060 |
| }, |
| { |
| "epoch": 1.0957036274762026, |
| "grad_norm": 0.19361141324043274, |
| "learning_rate": 1.6289219231162107e-05, |
| "loss": 0.7323, |
| "mean_token_accuracy": 0.7803295195102692, |
| "num_tokens": 2214555301.0, |
| "step": 1065 |
| }, |
| { |
| "epoch": 1.1008489837921276, |
| "grad_norm": 0.2365255057811737, |
| "learning_rate": 1.6252455236518088e-05, |
| "loss": 0.7223, |
| "mean_token_accuracy": 0.7834222823381424, |
| "num_tokens": 2224964461.0, |
| "step": 1070 |
| }, |
| { |
| "epoch": 1.1059943401080525, |
| "grad_norm": 0.22614383697509766, |
| "learning_rate": 1.6215570573191203e-05, |
| "loss": 0.7324, |
| "mean_token_accuracy": 0.7800393998622894, |
| "num_tokens": 2235385359.0, |
| "step": 1075 |
| }, |
| { |
| "epoch": 1.1111396964239773, |
| "grad_norm": 0.1952890157699585, |
| "learning_rate": 1.6178566427291196e-05, |
| "loss": 0.7361, |
| "mean_token_accuracy": 0.7795492619276047, |
| "num_tokens": 2245790049.0, |
| "step": 1080 |
| }, |
| { |
| "epoch": 1.1162850527399022, |
| "grad_norm": 0.26438701152801514, |
| "learning_rate": 1.614144398877006e-05, |
| "loss": 0.733, |
| "mean_token_accuracy": 0.7802164793014527, |
| "num_tokens": 2256182680.0, |
| "step": 1085 |
| }, |
| { |
| "epoch": 1.1214304090558271, |
| "grad_norm": 0.19735948741436005, |
| "learning_rate": 1.610420445138373e-05, |
| "loss": 0.7238, |
| "mean_token_accuracy": 0.7827324986457824, |
| "num_tokens": 2266607807.0, |
| "step": 1090 |
| }, |
| { |
| "epoch": 1.126575765371752, |
| "grad_norm": 0.20691247284412384, |
| "learning_rate": 1.6066849012653745e-05, |
| "loss": 0.727, |
| "mean_token_accuracy": 0.7817386299371719, |
| "num_tokens": 2277008491.0, |
| "step": 1095 |
| }, |
| { |
| "epoch": 1.1317211216876768, |
| "grad_norm": 0.22432605922222137, |
| "learning_rate": 1.6029378873828695e-05, |
| "loss": 0.7298, |
| "mean_token_accuracy": 0.7808958977460861, |
| "num_tokens": 2287392752.0, |
| "step": 1100 |
| }, |
| { |
| "epoch": 1.1368664780036017, |
| "grad_norm": 0.20151114463806152, |
| "learning_rate": 1.599179523984562e-05, |
| "loss": 0.737, |
| "mean_token_accuracy": 0.7791817605495452, |
| "num_tokens": 2297803721.0, |
| "step": 1105 |
| }, |
| { |
| "epoch": 1.1420118343195267, |
| "grad_norm": 0.2306589037179947, |
| "learning_rate": 1.5954099319291256e-05, |
| "loss": 0.7325, |
| "mean_token_accuracy": 0.7801610469818115, |
| "num_tokens": 2308187255.0, |
| "step": 1110 |
| }, |
| { |
| "epoch": 1.1471571906354514, |
| "grad_norm": 0.1993226855993271, |
| "learning_rate": 1.5916292324363156e-05, |
| "loss": 0.7251, |
| "mean_token_accuracy": 0.7821116268634796, |
| "num_tokens": 2318589605.0, |
| "step": 1115 |
| }, |
| { |
| "epoch": 1.1523025469513763, |
| "grad_norm": 0.22113800048828125, |
| "learning_rate": 1.5878375470830737e-05, |
| "loss": 0.743, |
| "mean_token_accuracy": 0.7773738950490952, |
| "num_tokens": 2328989752.0, |
| "step": 1120 |
| }, |
| { |
| "epoch": 1.1574479032673013, |
| "grad_norm": 0.2003169059753418, |
| "learning_rate": 1.584034997799615e-05, |
| "loss": 0.728, |
| "mean_token_accuracy": 0.7815322816371918, |
| "num_tokens": 2339418236.0, |
| "step": 1125 |
| }, |
| { |
| "epoch": 1.1625932595832262, |
| "grad_norm": 0.2361934781074524, |
| "learning_rate": 1.5802217068655103e-05, |
| "loss": 0.7198, |
| "mean_token_accuracy": 0.7839639008045196, |
| "num_tokens": 2349792828.0, |
| "step": 1130 |
| }, |
| { |
| "epoch": 1.167738615899151, |
| "grad_norm": 0.21388159692287445, |
| "learning_rate": 1.5763977969057514e-05, |
| "loss": 0.7327, |
| "mean_token_accuracy": 0.7798545330762863, |
| "num_tokens": 2360200634.0, |
| "step": 1135 |
| }, |
| { |
| "epoch": 1.172883972215076, |
| "grad_norm": 0.20029175281524658, |
| "learning_rate": 1.5725633908868098e-05, |
| "loss": 0.7338, |
| "mean_token_accuracy": 0.7797937542200089, |
| "num_tokens": 2370596817.0, |
| "step": 1140 |
| }, |
| { |
| "epoch": 1.1780293285310008, |
| "grad_norm": 0.19658546149730682, |
| "learning_rate": 1.568718612112681e-05, |
| "loss": 0.721, |
| "mean_token_accuracy": 0.7830342799425125, |
| "num_tokens": 2380988279.0, |
| "step": 1145 |
| }, |
| { |
| "epoch": 1.1831746848469256, |
| "grad_norm": 0.23174834251403809, |
| "learning_rate": 1.5648635842209197e-05, |
| "loss": 0.7311, |
| "mean_token_accuracy": 0.7808991014957428, |
| "num_tokens": 2391412429.0, |
| "step": 1150 |
| }, |
| { |
| "epoch": 1.1883200411628505, |
| "grad_norm": 0.21775346994400024, |
| "learning_rate": 1.5609984311786645e-05, |
| "loss": 0.729, |
| "mean_token_accuracy": 0.7810937970876694, |
| "num_tokens": 2401771777.0, |
| "step": 1155 |
| }, |
| { |
| "epoch": 1.1934653974787754, |
| "grad_norm": 0.21019048988819122, |
| "learning_rate": 1.5571232772786517e-05, |
| "loss": 0.7253, |
| "mean_token_accuracy": 0.7820482671260833, |
| "num_tokens": 2412159833.0, |
| "step": 1160 |
| }, |
| { |
| "epoch": 1.1986107537947004, |
| "grad_norm": 0.20135393738746643, |
| "learning_rate": 1.553238247135216e-05, |
| "loss": 0.7276, |
| "mean_token_accuracy": 0.7814355909824371, |
| "num_tokens": 2422558752.0, |
| "step": 1165 |
| }, |
| { |
| "epoch": 1.203756110110625, |
| "grad_norm": 0.18914781510829926, |
| "learning_rate": 1.549343465680287e-05, |
| "loss": 0.731, |
| "mean_token_accuracy": 0.7803467661142349, |
| "num_tokens": 2432969997.0, |
| "step": 1170 |
| }, |
| { |
| "epoch": 1.20890146642655, |
| "grad_norm": 0.21189342439174652, |
| "learning_rate": 1.5454390581593687e-05, |
| "loss": 0.7262, |
| "mean_token_accuracy": 0.7820929646492004, |
| "num_tokens": 2443373443.0, |
| "step": 1175 |
| }, |
| { |
| "epoch": 1.214046822742475, |
| "grad_norm": 0.1938139647245407, |
| "learning_rate": 1.541525150127513e-05, |
| "loss": 0.721, |
| "mean_token_accuracy": 0.7835055589675903, |
| "num_tokens": 2453770770.0, |
| "step": 1180 |
| }, |
| { |
| "epoch": 1.2191921790583997, |
| "grad_norm": 0.21657834947109222, |
| "learning_rate": 1.537601867445283e-05, |
| "loss": 0.7319, |
| "mean_token_accuracy": 0.7800745993852616, |
| "num_tokens": 2464158108.0, |
| "step": 1185 |
| }, |
| { |
| "epoch": 1.2243375353743247, |
| "grad_norm": 0.20266316831111908, |
| "learning_rate": 1.5336693362747036e-05, |
| "loss": 0.7274, |
| "mean_token_accuracy": 0.7812938541173935, |
| "num_tokens": 2474574860.0, |
| "step": 1190 |
| }, |
| { |
| "epoch": 1.2294828916902496, |
| "grad_norm": 0.21585378050804138, |
| "learning_rate": 1.5297276830752074e-05, |
| "loss": 0.7262, |
| "mean_token_accuracy": 0.7819346249103546, |
| "num_tokens": 2484977909.0, |
| "step": 1195 |
| }, |
| { |
| "epoch": 1.2346282480061745, |
| "grad_norm": 0.22400428354740143, |
| "learning_rate": 1.5257770345995648e-05, |
| "loss": 0.7325, |
| "mean_token_accuracy": 0.7801924586296082, |
| "num_tokens": 2495371556.0, |
| "step": 1200 |
| }, |
| { |
| "epoch": 1.2397736043220993, |
| "grad_norm": 0.2023976743221283, |
| "learning_rate": 1.5218175178898106e-05, |
| "loss": 0.7202, |
| "mean_token_accuracy": 0.7832947343587875, |
| "num_tokens": 2505752888.0, |
| "step": 1205 |
| }, |
| { |
| "epoch": 1.2449189606380242, |
| "grad_norm": 0.22408372163772583, |
| "learning_rate": 1.5178492602731581e-05, |
| "loss": 0.7254, |
| "mean_token_accuracy": 0.7819278568029404, |
| "num_tokens": 2516182631.0, |
| "step": 1210 |
| }, |
| { |
| "epoch": 1.250064316953949, |
| "grad_norm": 0.24158449470996857, |
| "learning_rate": 1.5138723893579028e-05, |
| "loss": 0.7296, |
| "mean_token_accuracy": 0.781261432170868, |
| "num_tokens": 2526605123.0, |
| "step": 1215 |
| }, |
| { |
| "epoch": 1.2552096732698739, |
| "grad_norm": 0.2226988673210144, |
| "learning_rate": 1.5098870330293218e-05, |
| "loss": 0.7171, |
| "mean_token_accuracy": 0.7841993749141694, |
| "num_tokens": 2537002245.0, |
| "step": 1220 |
| }, |
| { |
| "epoch": 1.2603550295857988, |
| "grad_norm": 0.23769278824329376, |
| "learning_rate": 1.505893319445559e-05, |
| "loss": 0.7325, |
| "mean_token_accuracy": 0.7801701694726944, |
| "num_tokens": 2547440043.0, |
| "step": 1225 |
| }, |
| { |
| "epoch": 1.2655003859017238, |
| "grad_norm": 0.1922103464603424, |
| "learning_rate": 1.5018913770335046e-05, |
| "loss": 0.7274, |
| "mean_token_accuracy": 0.781620192527771, |
| "num_tokens": 2557858544.0, |
| "step": 1230 |
| }, |
| { |
| "epoch": 1.2706457422176487, |
| "grad_norm": 0.18663917481899261, |
| "learning_rate": 1.4978813344846661e-05, |
| "loss": 0.7391, |
| "mean_token_accuracy": 0.7782456696033477, |
| "num_tokens": 2568263241.0, |
| "step": 1235 |
| }, |
| { |
| "epoch": 1.2757910985335734, |
| "grad_norm": 0.19044432044029236, |
| "learning_rate": 1.4938633207510287e-05, |
| "loss": 0.7267, |
| "mean_token_accuracy": 0.7819855302572251, |
| "num_tokens": 2578685798.0, |
| "step": 1240 |
| }, |
| { |
| "epoch": 1.2809364548494984, |
| "grad_norm": 0.2261897772550583, |
| "learning_rate": 1.4898374650409094e-05, |
| "loss": 0.7266, |
| "mean_token_accuracy": 0.7815109491348267, |
| "num_tokens": 2589079528.0, |
| "step": 1245 |
| }, |
| { |
| "epoch": 1.286081811165423, |
| "grad_norm": 0.27746737003326416, |
| "learning_rate": 1.485803896814801e-05, |
| "loss": 0.7255, |
| "mean_token_accuracy": 0.7821037322282791, |
| "num_tokens": 2599488251.0, |
| "step": 1250 |
| }, |
| { |
| "epoch": 1.291227167481348, |
| "grad_norm": 0.21356722712516785, |
| "learning_rate": 1.4817627457812107e-05, |
| "loss": 0.7218, |
| "mean_token_accuracy": 0.7831762701272964, |
| "num_tokens": 2609884395.0, |
| "step": 1255 |
| }, |
| { |
| "epoch": 1.296372523797273, |
| "grad_norm": 0.1928669810295105, |
| "learning_rate": 1.4777141418924874e-05, |
| "loss": 0.7222, |
| "mean_token_accuracy": 0.7831795990467072, |
| "num_tokens": 2620262838.0, |
| "step": 1260 |
| }, |
| { |
| "epoch": 1.301517880113198, |
| "grad_norm": 0.20577941834926605, |
| "learning_rate": 1.4736582153406431e-05, |
| "loss": 0.7345, |
| "mean_token_accuracy": 0.7796419382095336, |
| "num_tokens": 2630615363.0, |
| "step": 1265 |
| }, |
| { |
| "epoch": 1.3066632364291229, |
| "grad_norm": 0.2071027308702469, |
| "learning_rate": 1.4695950965531679e-05, |
| "loss": 0.7207, |
| "mean_token_accuracy": 0.7833283305168152, |
| "num_tokens": 2641010705.0, |
| "step": 1270 |
| }, |
| { |
| "epoch": 1.3118085927450476, |
| "grad_norm": 0.20452018082141876, |
| "learning_rate": 1.4655249161888322e-05, |
| "loss": 0.7219, |
| "mean_token_accuracy": 0.7826146423816681, |
| "num_tokens": 2651367538.0, |
| "step": 1275 |
| }, |
| { |
| "epoch": 1.3169539490609725, |
| "grad_norm": 0.19734811782836914, |
| "learning_rate": 1.46144780513349e-05, |
| "loss": 0.7293, |
| "mean_token_accuracy": 0.7808092325925827, |
| "num_tokens": 2661752746.0, |
| "step": 1280 |
| }, |
| { |
| "epoch": 1.3220993053768972, |
| "grad_norm": 0.2132222205400467, |
| "learning_rate": 1.4573638944958654e-05, |
| "loss": 0.7213, |
| "mean_token_accuracy": 0.783295625448227, |
| "num_tokens": 2672170022.0, |
| "step": 1285 |
| }, |
| { |
| "epoch": 1.3272446616928222, |
| "grad_norm": 0.20818860828876495, |
| "learning_rate": 1.4532733156033399e-05, |
| "loss": 0.7257, |
| "mean_token_accuracy": 0.7817448288202286, |
| "num_tokens": 2682534056.0, |
| "step": 1290 |
| }, |
| { |
| "epoch": 1.3323900180087471, |
| "grad_norm": 0.21955865621566772, |
| "learning_rate": 1.449176199997726e-05, |
| "loss": 0.7227, |
| "mean_token_accuracy": 0.7827985614538193, |
| "num_tokens": 2692942602.0, |
| "step": 1295 |
| }, |
| { |
| "epoch": 1.337535374324672, |
| "grad_norm": 0.2140192687511444, |
| "learning_rate": 1.4450726794310408e-05, |
| "loss": 0.7245, |
| "mean_token_accuracy": 0.7822914987802505, |
| "num_tokens": 2703334544.0, |
| "step": 1300 |
| }, |
| { |
| "epoch": 1.3426807306405968, |
| "grad_norm": 0.19470317661762238, |
| "learning_rate": 1.4409628858612665e-05, |
| "loss": 0.725, |
| "mean_token_accuracy": 0.7822528421878815, |
| "num_tokens": 2713729545.0, |
| "step": 1305 |
| }, |
| { |
| "epoch": 1.3478260869565217, |
| "grad_norm": 0.19224713742733002, |
| "learning_rate": 1.4368469514481083e-05, |
| "loss": 0.7159, |
| "mean_token_accuracy": 0.784244379401207, |
| "num_tokens": 2724133639.0, |
| "step": 1310 |
| }, |
| { |
| "epoch": 1.3529714432724467, |
| "grad_norm": 0.19122512638568878, |
| "learning_rate": 1.4327250085487435e-05, |
| "loss": 0.7318, |
| "mean_token_accuracy": 0.7805237233638763, |
| "num_tokens": 2734527785.0, |
| "step": 1315 |
| }, |
| { |
| "epoch": 1.3581167995883714, |
| "grad_norm": 0.19252869486808777, |
| "learning_rate": 1.428597189713566e-05, |
| "loss": 0.721, |
| "mean_token_accuracy": 0.7833033174276351, |
| "num_tokens": 2744932620.0, |
| "step": 1320 |
| }, |
| { |
| "epoch": 1.3632621559042963, |
| "grad_norm": 0.20513495802879333, |
| "learning_rate": 1.4244636276819247e-05, |
| "loss": 0.7288, |
| "mean_token_accuracy": 0.7811249732971192, |
| "num_tokens": 2755296881.0, |
| "step": 1325 |
| }, |
| { |
| "epoch": 1.3684075122202213, |
| "grad_norm": 0.21072103083133698, |
| "learning_rate": 1.4203244553778523e-05, |
| "loss": 0.7267, |
| "mean_token_accuracy": 0.781619307398796, |
| "num_tokens": 2765692282.0, |
| "step": 1330 |
| }, |
| { |
| "epoch": 1.3735528685361462, |
| "grad_norm": 0.23436151444911957, |
| "learning_rate": 1.4161798059057942e-05, |
| "loss": 0.7221, |
| "mean_token_accuracy": 0.7828515231609344, |
| "num_tokens": 2776081454.0, |
| "step": 1335 |
| }, |
| { |
| "epoch": 1.378698224852071, |
| "grad_norm": 0.19753047823905945, |
| "learning_rate": 1.4120298125463252e-05, |
| "loss": 0.73, |
| "mean_token_accuracy": 0.7808351576328277, |
| "num_tokens": 2786502061.0, |
| "step": 1340 |
| }, |
| { |
| "epoch": 1.3838435811679959, |
| "grad_norm": 0.2203502655029297, |
| "learning_rate": 1.4078746087518655e-05, |
| "loss": 0.7337, |
| "mean_token_accuracy": 0.7793877094984054, |
| "num_tokens": 2796907592.0, |
| "step": 1345 |
| }, |
| { |
| "epoch": 1.3889889374839208, |
| "grad_norm": 0.19259704649448395, |
| "learning_rate": 1.4037143281423885e-05, |
| "loss": 0.7254, |
| "mean_token_accuracy": 0.7815340638160706, |
| "num_tokens": 2807315293.0, |
| "step": 1350 |
| }, |
| { |
| "epoch": 1.3941342937998455, |
| "grad_norm": 0.20095007121562958, |
| "learning_rate": 1.3995491045011243e-05, |
| "loss": 0.7248, |
| "mean_token_accuracy": 0.7817043244838715, |
| "num_tokens": 2817690875.0, |
| "step": 1355 |
| }, |
| { |
| "epoch": 1.3992796501157705, |
| "grad_norm": 0.1883144974708557, |
| "learning_rate": 1.395379071770257e-05, |
| "loss": 0.7207, |
| "mean_token_accuracy": 0.7828416913747788, |
| "num_tokens": 2828099023.0, |
| "step": 1360 |
| }, |
| { |
| "epoch": 1.4044250064316954, |
| "grad_norm": 0.18824101984500885, |
| "learning_rate": 1.3912043640466175e-05, |
| "loss": 0.7194, |
| "mean_token_accuracy": 0.7835315078496933, |
| "num_tokens": 2838513670.0, |
| "step": 1365 |
| }, |
| { |
| "epoch": 1.4095703627476204, |
| "grad_norm": 0.19366908073425293, |
| "learning_rate": 1.387025115577373e-05, |
| "loss": 0.74, |
| "mean_token_accuracy": 0.7780154138803482, |
| "num_tokens": 2848908140.0, |
| "step": 1370 |
| }, |
| { |
| "epoch": 1.414715719063545, |
| "grad_norm": 0.18834272027015686, |
| "learning_rate": 1.382841460755707e-05, |
| "loss": 0.7279, |
| "mean_token_accuracy": 0.7807209342718124, |
| "num_tokens": 2859277820.0, |
| "step": 1375 |
| }, |
| { |
| "epoch": 1.41986107537947, |
| "grad_norm": 0.20872853696346283, |
| "learning_rate": 1.378653534116501e-05, |
| "loss": 0.7174, |
| "mean_token_accuracy": 0.7837792187929153, |
| "num_tokens": 2869692721.0, |
| "step": 1380 |
| }, |
| { |
| "epoch": 1.425006431695395, |
| "grad_norm": 0.21112094819545746, |
| "learning_rate": 1.3744614703320046e-05, |
| "loss": 0.7229, |
| "mean_token_accuracy": 0.7826662242412568, |
| "num_tokens": 2880093189.0, |
| "step": 1385 |
| }, |
| { |
| "epoch": 1.4301517880113197, |
| "grad_norm": 0.18763834238052368, |
| "learning_rate": 1.3702654042075077e-05, |
| "loss": 0.7244, |
| "mean_token_accuracy": 0.7820846647024154, |
| "num_tokens": 2890505046.0, |
| "step": 1390 |
| }, |
| { |
| "epoch": 1.4352971443272446, |
| "grad_norm": 0.1809283047914505, |
| "learning_rate": 1.3660654706770045e-05, |
| "loss": 0.7304, |
| "mean_token_accuracy": 0.7803399920463562, |
| "num_tokens": 2900869962.0, |
| "step": 1395 |
| }, |
| { |
| "epoch": 1.4404425006431696, |
| "grad_norm": 0.19696731865406036, |
| "learning_rate": 1.3618618047988541e-05, |
| "loss": 0.7229, |
| "mean_token_accuracy": 0.7826083064079284, |
| "num_tokens": 2911235205.0, |
| "step": 1400 |
| }, |
| { |
| "epoch": 1.4455878569590945, |
| "grad_norm": 0.21456697583198547, |
| "learning_rate": 1.3576545417514379e-05, |
| "loss": 0.7238, |
| "mean_token_accuracy": 0.7821739882230758, |
| "num_tokens": 2921630626.0, |
| "step": 1405 |
| }, |
| { |
| "epoch": 1.4507332132750193, |
| "grad_norm": 0.2006523162126541, |
| "learning_rate": 1.3534438168288122e-05, |
| "loss": 0.7236, |
| "mean_token_accuracy": 0.7829265475273133, |
| "num_tokens": 2931987655.0, |
| "step": 1410 |
| }, |
| { |
| "epoch": 1.4558785695909442, |
| "grad_norm": 0.20633164048194885, |
| "learning_rate": 1.3492297654363582e-05, |
| "loss": 0.7303, |
| "mean_token_accuracy": 0.7806042492389679, |
| "num_tokens": 2942398851.0, |
| "step": 1415 |
| }, |
| { |
| "epoch": 1.461023925906869, |
| "grad_norm": 0.18342465162277222, |
| "learning_rate": 1.3450125230864265e-05, |
| "loss": 0.7221, |
| "mean_token_accuracy": 0.7832267910242081, |
| "num_tokens": 2952797589.0, |
| "step": 1420 |
| }, |
| { |
| "epoch": 1.4661692822227939, |
| "grad_norm": 0.18584422767162323, |
| "learning_rate": 1.3407922253939801e-05, |
| "loss": 0.7207, |
| "mean_token_accuracy": 0.7827514231204986, |
| "num_tokens": 2963142662.0, |
| "step": 1425 |
| }, |
| { |
| "epoch": 1.4713146385387188, |
| "grad_norm": 0.2012760490179062, |
| "learning_rate": 1.3365690080722349e-05, |
| "loss": 0.7228, |
| "mean_token_accuracy": 0.7825741022825241, |
| "num_tokens": 2973551267.0, |
| "step": 1430 |
| }, |
| { |
| "epoch": 1.4764599948546437, |
| "grad_norm": 0.1869860738515854, |
| "learning_rate": 1.3323430069282922e-05, |
| "loss": 0.7123, |
| "mean_token_accuracy": 0.7854143947362899, |
| "num_tokens": 2983967802.0, |
| "step": 1435 |
| }, |
| { |
| "epoch": 1.4816053511705687, |
| "grad_norm": 0.18766240775585175, |
| "learning_rate": 1.3281143578587747e-05, |
| "loss": 0.7204, |
| "mean_token_accuracy": 0.7834302335977554, |
| "num_tokens": 2994361017.0, |
| "step": 1440 |
| }, |
| { |
| "epoch": 1.4867507074864934, |
| "grad_norm": 0.20129527151584625, |
| "learning_rate": 1.3238831968454547e-05, |
| "loss": 0.7295, |
| "mean_token_accuracy": 0.7809592038393021, |
| "num_tokens": 3004789429.0, |
| "step": 1445 |
| }, |
| { |
| "epoch": 1.4918960638024183, |
| "grad_norm": 0.1840723752975464, |
| "learning_rate": 1.3196496599508818e-05, |
| "loss": 0.7164, |
| "mean_token_accuracy": 0.7845001757144928, |
| "num_tokens": 3015178802.0, |
| "step": 1450 |
| }, |
| { |
| "epoch": 1.497041420118343, |
| "grad_norm": 0.18060751259326935, |
| "learning_rate": 1.3154138833140066e-05, |
| "loss": 0.7194, |
| "mean_token_accuracy": 0.7834681749343873, |
| "num_tokens": 3025590677.0, |
| "step": 1455 |
| }, |
| { |
| "epoch": 1.502186776434268, |
| "grad_norm": 0.20038992166519165, |
| "learning_rate": 1.3111760031458056e-05, |
| "loss": 0.7234, |
| "mean_token_accuracy": 0.7822674155235291, |
| "num_tokens": 3036005732.0, |
| "step": 1460 |
| }, |
| { |
| "epoch": 1.507332132750193, |
| "grad_norm": 0.17819659411907196, |
| "learning_rate": 1.3069361557248972e-05, |
| "loss": 0.7255, |
| "mean_token_accuracy": 0.781733363866806, |
| "num_tokens": 3046403093.0, |
| "step": 1465 |
| }, |
| { |
| "epoch": 1.512477489066118, |
| "grad_norm": 0.18325041234493256, |
| "learning_rate": 1.3026944773931623e-05, |
| "loss": 0.7241, |
| "mean_token_accuracy": 0.7819523394107819, |
| "num_tokens": 3056805589.0, |
| "step": 1470 |
| }, |
| { |
| "epoch": 1.5176228453820428, |
| "grad_norm": 0.1812821924686432, |
| "learning_rate": 1.2984511045513583e-05, |
| "loss": 0.7336, |
| "mean_token_accuracy": 0.7796476185321808, |
| "num_tokens": 3067202755.0, |
| "step": 1475 |
| }, |
| { |
| "epoch": 1.5227682016979676, |
| "grad_norm": 0.1757994294166565, |
| "learning_rate": 1.2942061736547338e-05, |
| "loss": 0.7252, |
| "mean_token_accuracy": 0.7821748554706573, |
| "num_tokens": 3077623015.0, |
| "step": 1480 |
| }, |
| { |
| "epoch": 1.5279135580138925, |
| "grad_norm": 0.18919821083545685, |
| "learning_rate": 1.2899598212086407e-05, |
| "loss": 0.7224, |
| "mean_token_accuracy": 0.7825713455677032, |
| "num_tokens": 3088009778.0, |
| "step": 1485 |
| }, |
| { |
| "epoch": 1.5330589143298172, |
| "grad_norm": 0.18530713021755219, |
| "learning_rate": 1.285712183764142e-05, |
| "loss": 0.7308, |
| "mean_token_accuracy": 0.7805260062217713, |
| "num_tokens": 3098430248.0, |
| "step": 1490 |
| }, |
| { |
| "epoch": 1.5382042706457422, |
| "grad_norm": 0.1960536390542984, |
| "learning_rate": 1.2814633979136254e-05, |
| "loss": 0.7224, |
| "mean_token_accuracy": 0.7830262005329132, |
| "num_tokens": 3108837811.0, |
| "step": 1495 |
| }, |
| { |
| "epoch": 1.543349626961667, |
| "grad_norm": 0.19076813757419586, |
| "learning_rate": 1.2772136002864067e-05, |
| "loss": 0.7221, |
| "mean_token_accuracy": 0.783091539144516, |
| "num_tokens": 3119231467.0, |
| "step": 1500 |
| }, |
| { |
| "epoch": 1.548494983277592, |
| "grad_norm": 0.20653417706489563, |
| "learning_rate": 1.2729629275443373e-05, |
| "loss": 0.7256, |
| "mean_token_accuracy": 0.781879261136055, |
| "num_tokens": 3129640782.0, |
| "step": 1505 |
| }, |
| { |
| "epoch": 1.553640339593517, |
| "grad_norm": 0.19807811081409454, |
| "learning_rate": 1.268711516377411e-05, |
| "loss": 0.7156, |
| "mean_token_accuracy": 0.7845076858997345, |
| "num_tokens": 3140041502.0, |
| "step": 1510 |
| }, |
| { |
| "epoch": 1.5587856959094417, |
| "grad_norm": 0.18549486994743347, |
| "learning_rate": 1.2644595034993667e-05, |
| "loss": 0.7145, |
| "mean_token_accuracy": 0.7843082189559937, |
| "num_tokens": 3150443408.0, |
| "step": 1515 |
| }, |
| { |
| "epoch": 1.5639310522253667, |
| "grad_norm": 0.18279866874217987, |
| "learning_rate": 1.260207025643293e-05, |
| "loss": 0.7247, |
| "mean_token_accuracy": 0.7818575918674469, |
| "num_tokens": 3160858821.0, |
| "step": 1520 |
| }, |
| { |
| "epoch": 1.5690764085412914, |
| "grad_norm": 0.19264079630374908, |
| "learning_rate": 1.25595421955723e-05, |
| "loss": 0.7373, |
| "mean_token_accuracy": 0.7786633670330048, |
| "num_tokens": 3171243172.0, |
| "step": 1525 |
| }, |
| { |
| "epoch": 1.5742217648572163, |
| "grad_norm": 0.20798954367637634, |
| "learning_rate": 1.2517012219997743e-05, |
| "loss": 0.7263, |
| "mean_token_accuracy": 0.7813588201999664, |
| "num_tokens": 3181666806.0, |
| "step": 1530 |
| }, |
| { |
| "epoch": 1.5793671211731413, |
| "grad_norm": 0.22132764756679535, |
| "learning_rate": 1.2474481697356784e-05, |
| "loss": 0.7158, |
| "mean_token_accuracy": 0.7841507345438004, |
| "num_tokens": 3192068095.0, |
| "step": 1535 |
| }, |
| { |
| "epoch": 1.5845124774890662, |
| "grad_norm": 0.1778470277786255, |
| "learning_rate": 1.2431951995314547e-05, |
| "loss": 0.7112, |
| "mean_token_accuracy": 0.7860257804393769, |
| "num_tokens": 3202473819.0, |
| "step": 1540 |
| }, |
| { |
| "epoch": 1.5896578338049911, |
| "grad_norm": 0.20948489010334015, |
| "learning_rate": 1.2389424481509766e-05, |
| "loss": 0.7283, |
| "mean_token_accuracy": 0.7805652767419815, |
| "num_tokens": 3212838888.0, |
| "step": 1545 |
| }, |
| { |
| "epoch": 1.5948031901209159, |
| "grad_norm": 0.22574825584888458, |
| "learning_rate": 1.2346900523510804e-05, |
| "loss": 0.7246, |
| "mean_token_accuracy": 0.7815930396318436, |
| "num_tokens": 3223222863.0, |
| "step": 1550 |
| }, |
| { |
| "epoch": 1.5999485464368406, |
| "grad_norm": 0.19260616600513458, |
| "learning_rate": 1.2304381488771684e-05, |
| "loss": 0.7192, |
| "mean_token_accuracy": 0.7832415938377381, |
| "num_tokens": 3233636338.0, |
| "step": 1555 |
| }, |
| { |
| "epoch": 1.6050939027527655, |
| "grad_norm": 0.2285853922367096, |
| "learning_rate": 1.2261868744588108e-05, |
| "loss": 0.7123, |
| "mean_token_accuracy": 0.7852210611104965, |
| "num_tokens": 3244050232.0, |
| "step": 1560 |
| }, |
| { |
| "epoch": 1.6102392590686905, |
| "grad_norm": 0.21371160447597504, |
| "learning_rate": 1.2219363658053496e-05, |
| "loss": 0.7236, |
| "mean_token_accuracy": 0.7823358774185181, |
| "num_tokens": 3254430770.0, |
| "step": 1565 |
| }, |
| { |
| "epoch": 1.6153846153846154, |
| "grad_norm": 0.20527417957782745, |
| "learning_rate": 1.217686759601501e-05, |
| "loss": 0.7147, |
| "mean_token_accuracy": 0.7844323009252548, |
| "num_tokens": 3264847891.0, |
| "step": 1570 |
| }, |
| { |
| "epoch": 1.6205299717005404, |
| "grad_norm": 0.18310104310512543, |
| "learning_rate": 1.2134381925029613e-05, |
| "loss": 0.725, |
| "mean_token_accuracy": 0.7815193057060241, |
| "num_tokens": 3275273883.0, |
| "step": 1575 |
| }, |
| { |
| "epoch": 1.6256753280164653, |
| "grad_norm": 0.18352478742599487, |
| "learning_rate": 1.209190801132012e-05, |
| "loss": 0.7174, |
| "mean_token_accuracy": 0.7840063363313675, |
| "num_tokens": 3285662067.0, |
| "step": 1580 |
| }, |
| { |
| "epoch": 1.63082068433239, |
| "grad_norm": 0.1829749196767807, |
| "learning_rate": 1.2049447220731266e-05, |
| "loss": 0.7256, |
| "mean_token_accuracy": 0.7813293248414993, |
| "num_tokens": 3296058473.0, |
| "step": 1585 |
| }, |
| { |
| "epoch": 1.6359660406483147, |
| "grad_norm": 0.18960633873939514, |
| "learning_rate": 1.2007000918685786e-05, |
| "loss": 0.7283, |
| "mean_token_accuracy": 0.780817398428917, |
| "num_tokens": 3306487200.0, |
| "step": 1590 |
| }, |
| { |
| "epoch": 1.6411113969642397, |
| "grad_norm": 0.18444406986236572, |
| "learning_rate": 1.196457047014049e-05, |
| "loss": 0.7166, |
| "mean_token_accuracy": 0.7841069996356964, |
| "num_tokens": 3316886128.0, |
| "step": 1595 |
| }, |
| { |
| "epoch": 1.6462567532801646, |
| "grad_norm": 0.18856759369373322, |
| "learning_rate": 1.1922157239542396e-05, |
| "loss": 0.7224, |
| "mean_token_accuracy": 0.7824205875396728, |
| "num_tokens": 3327276884.0, |
| "step": 1600 |
| }, |
| { |
| "epoch": 1.6514021095960896, |
| "grad_norm": 0.18209627270698547, |
| "learning_rate": 1.1879762590784832e-05, |
| "loss": 0.7097, |
| "mean_token_accuracy": 0.7861823856830596, |
| "num_tokens": 3337639665.0, |
| "step": 1605 |
| }, |
| { |
| "epoch": 1.6565474659120145, |
| "grad_norm": 0.18820519745349884, |
| "learning_rate": 1.1837387887163594e-05, |
| "loss": 0.7186, |
| "mean_token_accuracy": 0.783584377169609, |
| "num_tokens": 3348052567.0, |
| "step": 1610 |
| }, |
| { |
| "epoch": 1.6616928222279392, |
| "grad_norm": 0.2538444399833679, |
| "learning_rate": 1.1795034491333089e-05, |
| "loss": 0.7258, |
| "mean_token_accuracy": 0.7812894821166992, |
| "num_tokens": 3358442738.0, |
| "step": 1615 |
| }, |
| { |
| "epoch": 1.6668381785438642, |
| "grad_norm": 0.22863799333572388, |
| "learning_rate": 1.175270376526252e-05, |
| "loss": 0.7285, |
| "mean_token_accuracy": 0.7804809838533402, |
| "num_tokens": 3368845776.0, |
| "step": 1620 |
| }, |
| { |
| "epoch": 1.671983534859789, |
| "grad_norm": 0.1943485289812088, |
| "learning_rate": 1.1710397070192103e-05, |
| "loss": 0.7131, |
| "mean_token_accuracy": 0.7848410785198212, |
| "num_tokens": 3379247751.0, |
| "step": 1625 |
| }, |
| { |
| "epoch": 1.6771288911757138, |
| "grad_norm": 0.25817155838012695, |
| "learning_rate": 1.1668115766589278e-05, |
| "loss": 0.7209, |
| "mean_token_accuracy": 0.7828822433948517, |
| "num_tokens": 3389594723.0, |
| "step": 1630 |
| }, |
| { |
| "epoch": 1.6822742474916388, |
| "grad_norm": 0.21201983094215393, |
| "learning_rate": 1.1625861214104967e-05, |
| "loss": 0.7148, |
| "mean_token_accuracy": 0.7844018071889878, |
| "num_tokens": 3399983766.0, |
| "step": 1635 |
| }, |
| { |
| "epoch": 1.6874196038075637, |
| "grad_norm": 0.20084577798843384, |
| "learning_rate": 1.1583634771529843e-05, |
| "loss": 0.7167, |
| "mean_token_accuracy": 0.783909472823143, |
| "num_tokens": 3410348245.0, |
| "step": 1640 |
| }, |
| { |
| "epoch": 1.6925649601234887, |
| "grad_norm": 0.19142308831214905, |
| "learning_rate": 1.1541437796750651e-05, |
| "loss": 0.7216, |
| "mean_token_accuracy": 0.7828730583190918, |
| "num_tokens": 3420763568.0, |
| "step": 1645 |
| }, |
| { |
| "epoch": 1.6977103164394134, |
| "grad_norm": 0.21393629908561707, |
| "learning_rate": 1.1499271646706525e-05, |
| "loss": 0.7299, |
| "mean_token_accuracy": 0.7805344760417938, |
| "num_tokens": 3431161357.0, |
| "step": 1650 |
| }, |
| { |
| "epoch": 1.7028556727553383, |
| "grad_norm": 0.20122776925563812, |
| "learning_rate": 1.1457137677345362e-05, |
| "loss": 0.7147, |
| "mean_token_accuracy": 0.7848177880048752, |
| "num_tokens": 3441567248.0, |
| "step": 1655 |
| }, |
| { |
| "epoch": 1.708001029071263, |
| "grad_norm": 0.21072812378406525, |
| "learning_rate": 1.1415037243580219e-05, |
| "loss": 0.7069, |
| "mean_token_accuracy": 0.7868409931659699, |
| "num_tokens": 3451983024.0, |
| "step": 1660 |
| }, |
| { |
| "epoch": 1.713146385387188, |
| "grad_norm": 0.19437959790229797, |
| "learning_rate": 1.1372971699245732e-05, |
| "loss": 0.7196, |
| "mean_token_accuracy": 0.7829753428697586, |
| "num_tokens": 3462383943.0, |
| "step": 1665 |
| }, |
| { |
| "epoch": 1.718291741703113, |
| "grad_norm": 0.19020071625709534, |
| "learning_rate": 1.1330942397054599e-05, |
| "loss": 0.7231, |
| "mean_token_accuracy": 0.7823878258466721, |
| "num_tokens": 3472795076.0, |
| "step": 1670 |
| }, |
| { |
| "epoch": 1.7234370980190379, |
| "grad_norm": 0.17885635793209076, |
| "learning_rate": 1.1288950688554068e-05, |
| "loss": 0.7307, |
| "mean_token_accuracy": 0.7804099589586257, |
| "num_tokens": 3483201272.0, |
| "step": 1675 |
| }, |
| { |
| "epoch": 1.7285824543349628, |
| "grad_norm": 0.1998845338821411, |
| "learning_rate": 1.1246997924082465e-05, |
| "loss": 0.7178, |
| "mean_token_accuracy": 0.7835394382476807, |
| "num_tokens": 3493592730.0, |
| "step": 1680 |
| }, |
| { |
| "epoch": 1.7337278106508875, |
| "grad_norm": 0.1758834719657898, |
| "learning_rate": 1.1205085452725796e-05, |
| "loss": 0.7246, |
| "mean_token_accuracy": 0.7821025729179383, |
| "num_tokens": 3504000714.0, |
| "step": 1685 |
| }, |
| { |
| "epoch": 1.7388731669668125, |
| "grad_norm": 0.18512143194675446, |
| "learning_rate": 1.116321462227435e-05, |
| "loss": 0.7217, |
| "mean_token_accuracy": 0.7826696693897247, |
| "num_tokens": 3514403874.0, |
| "step": 1690 |
| }, |
| { |
| "epoch": 1.7440185232827372, |
| "grad_norm": 0.18202784657478333, |
| "learning_rate": 1.112138677917935e-05, |
| "loss": 0.7098, |
| "mean_token_accuracy": 0.7855264693498611, |
| "num_tokens": 3524800705.0, |
| "step": 1695 |
| }, |
| { |
| "epoch": 1.7491638795986622, |
| "grad_norm": 0.17856541275978088, |
| "learning_rate": 1.1079603268509671e-05, |
| "loss": 0.7223, |
| "mean_token_accuracy": 0.7820982217788697, |
| "num_tokens": 3535216074.0, |
| "step": 1700 |
| }, |
| { |
| "epoch": 1.754309235914587, |
| "grad_norm": 0.18560512363910675, |
| "learning_rate": 1.1037865433908574e-05, |
| "loss": 0.7163, |
| "mean_token_accuracy": 0.7839051306247711, |
| "num_tokens": 3545649484.0, |
| "step": 1705 |
| }, |
| { |
| "epoch": 1.759454592230512, |
| "grad_norm": 0.1768876314163208, |
| "learning_rate": 1.0996174617550506e-05, |
| "loss": 0.7147, |
| "mean_token_accuracy": 0.78475923538208, |
| "num_tokens": 3556016373.0, |
| "step": 1710 |
| }, |
| { |
| "epoch": 1.764599948546437, |
| "grad_norm": 0.18054403364658356, |
| "learning_rate": 1.0954532160097937e-05, |
| "loss": 0.7199, |
| "mean_token_accuracy": 0.783240556716919, |
| "num_tokens": 3566403061.0, |
| "step": 1715 |
| }, |
| { |
| "epoch": 1.7697453048623617, |
| "grad_norm": 0.1674446016550064, |
| "learning_rate": 1.0912939400658243e-05, |
| "loss": 0.7223, |
| "mean_token_accuracy": 0.7823190867900849, |
| "num_tokens": 3576811704.0, |
| "step": 1720 |
| }, |
| { |
| "epoch": 1.7748906611782866, |
| "grad_norm": 0.19289781153202057, |
| "learning_rate": 1.0871397676740647e-05, |
| "loss": 0.7268, |
| "mean_token_accuracy": 0.7810219496488571, |
| "num_tokens": 3587228256.0, |
| "step": 1725 |
| }, |
| { |
| "epoch": 1.7800360174942114, |
| "grad_norm": 0.19123966991901398, |
| "learning_rate": 1.0829908324213214e-05, |
| "loss": 0.7203, |
| "mean_token_accuracy": 0.7828882426023483, |
| "num_tokens": 3597638203.0, |
| "step": 1730 |
| }, |
| { |
| "epoch": 1.7851813738101363, |
| "grad_norm": 0.17165765166282654, |
| "learning_rate": 1.0788472677259888e-05, |
| "loss": 0.7237, |
| "mean_token_accuracy": 0.7817003160715104, |
| "num_tokens": 3608044316.0, |
| "step": 1735 |
| }, |
| { |
| "epoch": 1.7903267301260612, |
| "grad_norm": 0.17944514751434326, |
| "learning_rate": 1.074709206833759e-05, |
| "loss": 0.7254, |
| "mean_token_accuracy": 0.7813136577606201, |
| "num_tokens": 3618445618.0, |
| "step": 1740 |
| }, |
| { |
| "epoch": 1.7954720864419862, |
| "grad_norm": 0.19152477383613586, |
| "learning_rate": 1.070576782813336e-05, |
| "loss": 0.7244, |
| "mean_token_accuracy": 0.7815535515546799, |
| "num_tokens": 3628864024.0, |
| "step": 1745 |
| }, |
| { |
| "epoch": 1.8006174427579111, |
| "grad_norm": 0.20791219174861908, |
| "learning_rate": 1.0664501285521585e-05, |
| "loss": 0.7185, |
| "mean_token_accuracy": 0.7832733541727066, |
| "num_tokens": 3639237896.0, |
| "step": 1750 |
| }, |
| { |
| "epoch": 1.8057627990738359, |
| "grad_norm": 0.17812472581863403, |
| "learning_rate": 1.0623293767521248e-05, |
| "loss": 0.7241, |
| "mean_token_accuracy": 0.7819753557443618, |
| "num_tokens": 3649639864.0, |
| "step": 1755 |
| }, |
| { |
| "epoch": 1.8109081553897606, |
| "grad_norm": 0.2049945890903473, |
| "learning_rate": 1.0582146599253271e-05, |
| "loss": 0.7257, |
| "mean_token_accuracy": 0.7812327802181244, |
| "num_tokens": 3660058988.0, |
| "step": 1760 |
| }, |
| { |
| "epoch": 1.8160535117056855, |
| "grad_norm": 0.17638404667377472, |
| "learning_rate": 1.0541061103897881e-05, |
| "loss": 0.7224, |
| "mean_token_accuracy": 0.782220122218132, |
| "num_tokens": 3670478945.0, |
| "step": 1765 |
| }, |
| { |
| "epoch": 1.8211988680216105, |
| "grad_norm": 0.19884805381298065, |
| "learning_rate": 1.0500038602652087e-05, |
| "loss": 0.7231, |
| "mean_token_accuracy": 0.7823522746562958, |
| "num_tokens": 3680853435.0, |
| "step": 1770 |
| }, |
| { |
| "epoch": 1.8263442243375354, |
| "grad_norm": 0.2005850225687027, |
| "learning_rate": 1.0459080414687166e-05, |
| "loss": 0.7109, |
| "mean_token_accuracy": 0.7854242146015167, |
| "num_tokens": 3691255046.0, |
| "step": 1775 |
| }, |
| { |
| "epoch": 1.8314895806534603, |
| "grad_norm": 0.18633082509040833, |
| "learning_rate": 1.041818785710627e-05, |
| "loss": 0.7138, |
| "mean_token_accuracy": 0.7844531148672104, |
| "num_tokens": 3701666509.0, |
| "step": 1780 |
| }, |
| { |
| "epoch": 1.8366349369693853, |
| "grad_norm": 0.20367996394634247, |
| "learning_rate": 1.037736224490205e-05, |
| "loss": 0.7277, |
| "mean_token_accuracy": 0.7811777710914611, |
| "num_tokens": 3712030587.0, |
| "step": 1785 |
| }, |
| { |
| "epoch": 1.84178029328531, |
| "grad_norm": 0.19179487228393555, |
| "learning_rate": 1.033660489091437e-05, |
| "loss": 0.7184, |
| "mean_token_accuracy": 0.7832197934389115, |
| "num_tokens": 3722418473.0, |
| "step": 1790 |
| }, |
| { |
| "epoch": 1.8469256496012347, |
| "grad_norm": 0.1956593543291092, |
| "learning_rate": 1.0295917105788116e-05, |
| "loss": 0.7176, |
| "mean_token_accuracy": 0.7836333483457565, |
| "num_tokens": 3732817985.0, |
| "step": 1795 |
| }, |
| { |
| "epoch": 1.8520710059171597, |
| "grad_norm": 0.18963828682899475, |
| "learning_rate": 1.0255300197931008e-05, |
| "loss": 0.7264, |
| "mean_token_accuracy": 0.781423020362854, |
| "num_tokens": 3743226210.0, |
| "step": 1800 |
| }, |
| { |
| "epoch": 1.8572163622330846, |
| "grad_norm": 0.16611480712890625, |
| "learning_rate": 1.021475547347157e-05, |
| "loss": 0.7201, |
| "mean_token_accuracy": 0.7829820960760117, |
| "num_tokens": 3753595703.0, |
| "step": 1805 |
| }, |
| { |
| "epoch": 1.8623617185490096, |
| "grad_norm": 0.17385919392108917, |
| "learning_rate": 1.017428423621708e-05, |
| "loss": 0.7294, |
| "mean_token_accuracy": 0.7805358350276947, |
| "num_tokens": 3764011843.0, |
| "step": 1810 |
| }, |
| { |
| "epoch": 1.8675070748649345, |
| "grad_norm": 0.18427050113677979, |
| "learning_rate": 1.0133887787611691e-05, |
| "loss": 0.7199, |
| "mean_token_accuracy": 0.7830179333686829, |
| "num_tokens": 3774434254.0, |
| "step": 1815 |
| }, |
| { |
| "epoch": 1.8726524311808592, |
| "grad_norm": 0.18085253238677979, |
| "learning_rate": 1.0093567426694544e-05, |
| "loss": 0.71, |
| "mean_token_accuracy": 0.7860257983207702, |
| "num_tokens": 3784829343.0, |
| "step": 1820 |
| }, |
| { |
| "epoch": 1.8777977874967842, |
| "grad_norm": 0.18546296656131744, |
| "learning_rate": 1.0053324450058017e-05, |
| "loss": 0.7121, |
| "mean_token_accuracy": 0.7851428180932999, |
| "num_tokens": 3795222093.0, |
| "step": 1825 |
| }, |
| { |
| "epoch": 1.8829431438127089, |
| "grad_norm": 0.1925223022699356, |
| "learning_rate": 1.0013160151806019e-05, |
| "loss": 0.7235, |
| "mean_token_accuracy": 0.7820713192224502, |
| "num_tokens": 3805638950.0, |
| "step": 1830 |
| }, |
| { |
| "epoch": 1.8880885001286338, |
| "grad_norm": 0.17734932899475098, |
| "learning_rate": 9.973075823512368e-06, |
| "loss": 0.7232, |
| "mean_token_accuracy": 0.78176209628582, |
| "num_tokens": 3816016677.0, |
| "step": 1835 |
| }, |
| { |
| "epoch": 1.8932338564445588, |
| "grad_norm": 0.1860755831003189, |
| "learning_rate": 9.933072754179285e-06, |
| "loss": 0.7142, |
| "mean_token_accuracy": 0.7846502423286438, |
| "num_tokens": 3826390292.0, |
| "step": 1840 |
| }, |
| { |
| "epoch": 1.8983792127604837, |
| "grad_norm": 0.17822657525539398, |
| "learning_rate": 9.893152230195909e-06, |
| "loss": 0.7158, |
| "mean_token_accuracy": 0.7840298235416412, |
| "num_tokens": 3836783019.0, |
| "step": 1845 |
| }, |
| { |
| "epoch": 1.9035245690764087, |
| "grad_norm": 0.19308920204639435, |
| "learning_rate": 9.85331553529696e-06, |
| "loss": 0.7089, |
| "mean_token_accuracy": 0.7863038212060929, |
| "num_tokens": 3847188499.0, |
| "step": 1850 |
| }, |
| { |
| "epoch": 1.9086699253923334, |
| "grad_norm": 0.18061098456382751, |
| "learning_rate": 9.813563950521435e-06, |
| "loss": 0.7116, |
| "mean_token_accuracy": 0.7850539714097977, |
| "num_tokens": 3857595213.0, |
| "step": 1855 |
| }, |
| { |
| "epoch": 1.9138152817082583, |
| "grad_norm": 0.18107041716575623, |
| "learning_rate": 9.773898754171425e-06, |
| "loss": 0.7219, |
| "mean_token_accuracy": 0.7823956727981567, |
| "num_tokens": 3867991091.0, |
| "step": 1860 |
| }, |
| { |
| "epoch": 1.918960638024183, |
| "grad_norm": 0.18848438560962677, |
| "learning_rate": 9.734321221771003e-06, |
| "loss": 0.7211, |
| "mean_token_accuracy": 0.7825707286596298, |
| "num_tokens": 3878420983.0, |
| "step": 1865 |
| }, |
| { |
| "epoch": 1.924105994340108, |
| "grad_norm": 0.1764371693134308, |
| "learning_rate": 9.69483262602522e-06, |
| "loss": 0.7193, |
| "mean_token_accuracy": 0.7829346388578415, |
| "num_tokens": 3888799338.0, |
| "step": 1870 |
| }, |
| { |
| "epoch": 1.929251350656033, |
| "grad_norm": 0.21297839283943176, |
| "learning_rate": 9.655434236779157e-06, |
| "loss": 0.7255, |
| "mean_token_accuracy": 0.78101367354393, |
| "num_tokens": 3899196340.0, |
| "step": 1875 |
| }, |
| { |
| "epoch": 1.9343967069719579, |
| "grad_norm": 0.16781088709831238, |
| "learning_rate": 9.616127320977103e-06, |
| "loss": 0.7241, |
| "mean_token_accuracy": 0.7818098127841949, |
| "num_tokens": 3909607686.0, |
| "step": 1880 |
| }, |
| { |
| "epoch": 1.9395420632878828, |
| "grad_norm": 0.1716819554567337, |
| "learning_rate": 9.576913142621814e-06, |
| "loss": 0.7226, |
| "mean_token_accuracy": 0.7824906349182129, |
| "num_tokens": 3920018100.0, |
| "step": 1885 |
| }, |
| { |
| "epoch": 1.9446874196038075, |
| "grad_norm": 0.16348664462566376, |
| "learning_rate": 9.537792962733865e-06, |
| "loss": 0.7087, |
| "mean_token_accuracy": 0.7859474241733551, |
| "num_tokens": 3930421105.0, |
| "step": 1890 |
| }, |
| { |
| "epoch": 1.9498327759197325, |
| "grad_norm": 0.17886386811733246, |
| "learning_rate": 9.498768039311091e-06, |
| "loss": 0.7195, |
| "mean_token_accuracy": 0.7828368335962296, |
| "num_tokens": 3940804188.0, |
| "step": 1895 |
| }, |
| { |
| "epoch": 1.9549781322356572, |
| "grad_norm": 0.19559861719608307, |
| "learning_rate": 9.459839627288149e-06, |
| "loss": 0.7223, |
| "mean_token_accuracy": 0.7822702258825303, |
| "num_tokens": 3951164761.0, |
| "step": 1900 |
| }, |
| { |
| "epoch": 1.9601234885515821, |
| "grad_norm": 0.1999935507774353, |
| "learning_rate": 9.421008978496147e-06, |
| "loss": 0.7302, |
| "mean_token_accuracy": 0.7797028571367264, |
| "num_tokens": 3961562769.0, |
| "step": 1905 |
| }, |
| { |
| "epoch": 1.965268844867507, |
| "grad_norm": 0.17995114624500275, |
| "learning_rate": 9.3822773416224e-06, |
| "loss": 0.7242, |
| "mean_token_accuracy": 0.7817402511835099, |
| "num_tokens": 3971972365.0, |
| "step": 1910 |
| }, |
| { |
| "epoch": 1.970414201183432, |
| "grad_norm": 0.19008983671665192, |
| "learning_rate": 9.343645962170267e-06, |
| "loss": 0.701, |
| "mean_token_accuracy": 0.7879698783159256, |
| "num_tokens": 3982366725.0, |
| "step": 1915 |
| }, |
| { |
| "epoch": 1.975559557499357, |
| "grad_norm": 0.1909506916999817, |
| "learning_rate": 9.305116082419098e-06, |
| "loss": 0.7189, |
| "mean_token_accuracy": 0.7828688323497772, |
| "num_tokens": 3992755868.0, |
| "step": 1920 |
| }, |
| { |
| "epoch": 1.9807049138152817, |
| "grad_norm": 0.20575623214244843, |
| "learning_rate": 9.266688941384307e-06, |
| "loss": 0.7144, |
| "mean_token_accuracy": 0.7844961941242218, |
| "num_tokens": 4003176984.0, |
| "step": 1925 |
| }, |
| { |
| "epoch": 1.9858502701312066, |
| "grad_norm": 0.19786439836025238, |
| "learning_rate": 9.228365774777498e-06, |
| "loss": 0.7134, |
| "mean_token_accuracy": 0.7853114068508148, |
| "num_tokens": 4013594555.0, |
| "step": 1930 |
| }, |
| { |
| "epoch": 1.9909956264471314, |
| "grad_norm": 0.19185397028923035, |
| "learning_rate": 9.190147814966747e-06, |
| "loss": 0.717, |
| "mean_token_accuracy": 0.7838179767131805, |
| "num_tokens": 4023992898.0, |
| "step": 1935 |
| }, |
| { |
| "epoch": 1.9961409827630563, |
| "grad_norm": 0.187905415892601, |
| "learning_rate": 9.152036290936966e-06, |
| "loss": 0.7137, |
| "mean_token_accuracy": 0.7846971601247787, |
| "num_tokens": 4034394748.0, |
| "step": 1940 |
| }, |
| { |
| "epoch": 2.001029071263185, |
| "grad_norm": 0.2134164422750473, |
| "learning_rate": 9.114032428250385e-06, |
| "loss": 0.7088, |
| "mean_token_accuracy": 0.7855551619278757, |
| "num_tokens": 4044275069.0, |
| "step": 1945 |
| }, |
| { |
| "epoch": 2.00617442757911, |
| "grad_norm": 0.19707649946212769, |
| "learning_rate": 9.07613744900714e-06, |
| "loss": 0.6946, |
| "mean_token_accuracy": 0.7890868008136749, |
| "num_tokens": 4054664720.0, |
| "step": 1950 |
| }, |
| { |
| "epoch": 2.0113197838950345, |
| "grad_norm": 0.1966494768857956, |
| "learning_rate": 9.038352571805973e-06, |
| "loss": 0.7024, |
| "mean_token_accuracy": 0.7869040161371231, |
| "num_tokens": 4065081909.0, |
| "step": 1955 |
| }, |
| { |
| "epoch": 2.0164651402109595, |
| "grad_norm": 0.21523414552211761, |
| "learning_rate": 9.000679011705048e-06, |
| "loss": 0.7, |
| "mean_token_accuracy": 0.7871535241603851, |
| "num_tokens": 4075473101.0, |
| "step": 1960 |
| }, |
| { |
| "epoch": 2.0216104965268844, |
| "grad_norm": 0.20164090394973755, |
| "learning_rate": 8.963117980182871e-06, |
| "loss": 0.6879, |
| "mean_token_accuracy": 0.7908169955015183, |
| "num_tokens": 4085893536.0, |
| "step": 1965 |
| }, |
| { |
| "epoch": 2.0267558528428093, |
| "grad_norm": 0.1790938675403595, |
| "learning_rate": 8.925670685099344e-06, |
| "loss": 0.6966, |
| "mean_token_accuracy": 0.7879111260175705, |
| "num_tokens": 4096303930.0, |
| "step": 1970 |
| }, |
| { |
| "epoch": 2.0319012091587343, |
| "grad_norm": 0.18033479154109955, |
| "learning_rate": 8.888338330656909e-06, |
| "loss": 0.6907, |
| "mean_token_accuracy": 0.7897056668996811, |
| "num_tokens": 4106711156.0, |
| "step": 1975 |
| }, |
| { |
| "epoch": 2.0370465654746592, |
| "grad_norm": 0.18739460408687592, |
| "learning_rate": 8.851122117361845e-06, |
| "loss": 0.6848, |
| "mean_token_accuracy": 0.7917119234800338, |
| "num_tokens": 4117100496.0, |
| "step": 1980 |
| }, |
| { |
| "epoch": 2.042191921790584, |
| "grad_norm": 0.18272534012794495, |
| "learning_rate": 8.814023241985633e-06, |
| "loss": 0.7014, |
| "mean_token_accuracy": 0.7867262125015259, |
| "num_tokens": 4127509126.0, |
| "step": 1985 |
| }, |
| { |
| "epoch": 2.0473372781065087, |
| "grad_norm": 0.2082863599061966, |
| "learning_rate": 8.777042897526491e-06, |
| "loss": 0.6971, |
| "mean_token_accuracy": 0.7877671688795089, |
| "num_tokens": 4137900742.0, |
| "step": 1990 |
| }, |
| { |
| "epoch": 2.0524826344224336, |
| "grad_norm": 0.18298238515853882, |
| "learning_rate": 8.740182273171021e-06, |
| "loss": 0.6937, |
| "mean_token_accuracy": 0.789008492231369, |
| "num_tokens": 4148307231.0, |
| "step": 1995 |
| }, |
| { |
| "epoch": 2.0576279907383586, |
| "grad_norm": 0.20916491746902466, |
| "learning_rate": 8.703442554255945e-06, |
| "loss": 0.6971, |
| "mean_token_accuracy": 0.7878756642341613, |
| "num_tokens": 4158703927.0, |
| "step": 2000 |
| }, |
| { |
| "epoch": 2.0627733470542835, |
| "grad_norm": 0.17486633360385895, |
| "learning_rate": 8.666824922229993e-06, |
| "loss": 0.6997, |
| "mean_token_accuracy": 0.7873221039772034, |
| "num_tokens": 4169110558.0, |
| "step": 2005 |
| }, |
| { |
| "epoch": 2.0679187033702084, |
| "grad_norm": 0.18294841051101685, |
| "learning_rate": 8.630330554615918e-06, |
| "loss": 0.6957, |
| "mean_token_accuracy": 0.7886905431747436, |
| "num_tokens": 4179479610.0, |
| "step": 2010 |
| }, |
| { |
| "epoch": 2.0730640596861334, |
| "grad_norm": 0.17662659287452698, |
| "learning_rate": 8.593960624972635e-06, |
| "loss": 0.6896, |
| "mean_token_accuracy": 0.790263557434082, |
| "num_tokens": 4189878842.0, |
| "step": 2015 |
| }, |
| { |
| "epoch": 2.0782094160020583, |
| "grad_norm": 0.18194252252578735, |
| "learning_rate": 8.557716302857469e-06, |
| "loss": 0.6955, |
| "mean_token_accuracy": 0.7886851370334625, |
| "num_tokens": 4200279229.0, |
| "step": 2020 |
| }, |
| { |
| "epoch": 2.083354772317983, |
| "grad_norm": 0.1726061850786209, |
| "learning_rate": 8.521598753788538e-06, |
| "loss": 0.6975, |
| "mean_token_accuracy": 0.7880923539400101, |
| "num_tokens": 4210680215.0, |
| "step": 2025 |
| }, |
| { |
| "epoch": 2.0885001286339078, |
| "grad_norm": 0.17614272236824036, |
| "learning_rate": 8.485609139207312e-06, |
| "loss": 0.6859, |
| "mean_token_accuracy": 0.7908884584903717, |
| "num_tokens": 4221086681.0, |
| "step": 2030 |
| }, |
| { |
| "epoch": 2.0936454849498327, |
| "grad_norm": 0.16882190108299255, |
| "learning_rate": 8.449748616441217e-06, |
| "loss": 0.6888, |
| "mean_token_accuracy": 0.7902822762727737, |
| "num_tokens": 4231490233.0, |
| "step": 2035 |
| }, |
| { |
| "epoch": 2.0987908412657577, |
| "grad_norm": 0.18029114603996277, |
| "learning_rate": 8.414018338666453e-06, |
| "loss": 0.6964, |
| "mean_token_accuracy": 0.7881254225969314, |
| "num_tokens": 4241880702.0, |
| "step": 2040 |
| }, |
| { |
| "epoch": 2.1039361975816826, |
| "grad_norm": 0.1758222132921219, |
| "learning_rate": 8.378419454870885e-06, |
| "loss": 0.6922, |
| "mean_token_accuracy": 0.7890959054231643, |
| "num_tokens": 4252273338.0, |
| "step": 2045 |
| }, |
| { |
| "epoch": 2.1090815538976075, |
| "grad_norm": 0.18869197368621826, |
| "learning_rate": 8.34295310981712e-06, |
| "loss": 0.6975, |
| "mean_token_accuracy": 0.7877348899841309, |
| "num_tokens": 4262684519.0, |
| "step": 2050 |
| }, |
| { |
| "epoch": 2.1142269102135325, |
| "grad_norm": 0.19612114131450653, |
| "learning_rate": 8.307620444005675e-06, |
| "loss": 0.6857, |
| "mean_token_accuracy": 0.7910007119178772, |
| "num_tokens": 4273105678.0, |
| "step": 2055 |
| }, |
| { |
| "epoch": 2.119372266529457, |
| "grad_norm": 0.18810050189495087, |
| "learning_rate": 8.272422593638312e-06, |
| "loss": 0.7012, |
| "mean_token_accuracy": 0.7865576684474945, |
| "num_tokens": 4283510594.0, |
| "step": 2060 |
| }, |
| { |
| "epoch": 2.124517622845382, |
| "grad_norm": 0.1900843381881714, |
| "learning_rate": 8.237360690581494e-06, |
| "loss": 0.6946, |
| "mean_token_accuracy": 0.7886899948120117, |
| "num_tokens": 4293904300.0, |
| "step": 2065 |
| }, |
| { |
| "epoch": 2.129662979161307, |
| "grad_norm": 0.20008240640163422, |
| "learning_rate": 8.202435862329992e-06, |
| "loss": 0.6931, |
| "mean_token_accuracy": 0.7892868250608445, |
| "num_tokens": 4304318437.0, |
| "step": 2070 |
| }, |
| { |
| "epoch": 2.134808335477232, |
| "grad_norm": 0.19377216696739197, |
| "learning_rate": 8.167649231970629e-06, |
| "loss": 0.7087, |
| "mean_token_accuracy": 0.7848498582839966, |
| "num_tokens": 4314743524.0, |
| "step": 2075 |
| }, |
| { |
| "epoch": 2.1399536917931568, |
| "grad_norm": 0.18533039093017578, |
| "learning_rate": 8.13300191814616e-06, |
| "loss": 0.6985, |
| "mean_token_accuracy": 0.787962692975998, |
| "num_tokens": 4325118311.0, |
| "step": 2080 |
| }, |
| { |
| "epoch": 2.1450990481090817, |
| "grad_norm": 0.18524758517742157, |
| "learning_rate": 8.098495035019307e-06, |
| "loss": 0.6933, |
| "mean_token_accuracy": 0.7891044408082962, |
| "num_tokens": 4335531487.0, |
| "step": 2085 |
| }, |
| { |
| "epoch": 2.1502444044250066, |
| "grad_norm": 0.20954075455665588, |
| "learning_rate": 8.064129692236914e-06, |
| "loss": 0.6988, |
| "mean_token_accuracy": 0.7874211251735688, |
| "num_tokens": 4345955448.0, |
| "step": 2090 |
| }, |
| { |
| "epoch": 2.155389760740931, |
| "grad_norm": 0.19117045402526855, |
| "learning_rate": 8.029906994894285e-06, |
| "loss": 0.6847, |
| "mean_token_accuracy": 0.7915515124797821, |
| "num_tokens": 4356359783.0, |
| "step": 2095 |
| }, |
| { |
| "epoch": 2.160535117056856, |
| "grad_norm": 0.21733401715755463, |
| "learning_rate": 7.995828043499637e-06, |
| "loss": 0.6933, |
| "mean_token_accuracy": 0.7889263033866882, |
| "num_tokens": 4366739707.0, |
| "step": 2100 |
| }, |
| { |
| "epoch": 2.165680473372781, |
| "grad_norm": 0.18222884833812714, |
| "learning_rate": 7.961893933938707e-06, |
| "loss": 0.7027, |
| "mean_token_accuracy": 0.7866089105606079, |
| "num_tokens": 4377106402.0, |
| "step": 2105 |
| }, |
| { |
| "epoch": 2.170825829688706, |
| "grad_norm": 0.1800689697265625, |
| "learning_rate": 7.92810575743952e-06, |
| "loss": 0.6993, |
| "mean_token_accuracy": 0.7875191777944565, |
| "num_tokens": 4387518880.0, |
| "step": 2110 |
| }, |
| { |
| "epoch": 2.175971186004631, |
| "grad_norm": 0.1838780641555786, |
| "learning_rate": 7.89446460053728e-06, |
| "loss": 0.6941, |
| "mean_token_accuracy": 0.7888187408447266, |
| "num_tokens": 4397892379.0, |
| "step": 2115 |
| }, |
| { |
| "epoch": 2.181116542320556, |
| "grad_norm": 0.19722041487693787, |
| "learning_rate": 7.860971545039466e-06, |
| "loss": 0.6971, |
| "mean_token_accuracy": 0.7882630676031113, |
| "num_tokens": 4408268522.0, |
| "step": 2120 |
| }, |
| { |
| "epoch": 2.186261898636481, |
| "grad_norm": 0.18653474748134613, |
| "learning_rate": 7.827627667991e-06, |
| "loss": 0.6955, |
| "mean_token_accuracy": 0.787960433959961, |
| "num_tokens": 4418677517.0, |
| "step": 2125 |
| }, |
| { |
| "epoch": 2.1914072549524053, |
| "grad_norm": 0.18152935802936554, |
| "learning_rate": 7.794434041639651e-06, |
| "loss": 0.6969, |
| "mean_token_accuracy": 0.7878915429115295, |
| "num_tokens": 4429109296.0, |
| "step": 2130 |
| }, |
| { |
| "epoch": 2.1965526112683302, |
| "grad_norm": 0.19016778469085693, |
| "learning_rate": 7.761391733401523e-06, |
| "loss": 0.6966, |
| "mean_token_accuracy": 0.7879950881004334, |
| "num_tokens": 4439524505.0, |
| "step": 2135 |
| }, |
| { |
| "epoch": 2.201697967584255, |
| "grad_norm": 0.18335750699043274, |
| "learning_rate": 7.728501805826751e-06, |
| "loss": 0.7016, |
| "mean_token_accuracy": 0.786550509929657, |
| "num_tokens": 4449922294.0, |
| "step": 2140 |
| }, |
| { |
| "epoch": 2.20684332390018, |
| "grad_norm": 0.17287969589233398, |
| "learning_rate": 7.695765316565326e-06, |
| "loss": 0.6885, |
| "mean_token_accuracy": 0.7902193248271943, |
| "num_tokens": 4460309251.0, |
| "step": 2145 |
| }, |
| { |
| "epoch": 2.211988680216105, |
| "grad_norm": 0.17045357823371887, |
| "learning_rate": 7.66318331833308e-06, |
| "loss": 0.6822, |
| "mean_token_accuracy": 0.7919429570436478, |
| "num_tokens": 4470722415.0, |
| "step": 2150 |
| }, |
| { |
| "epoch": 2.21713403653203, |
| "grad_norm": 0.1837993860244751, |
| "learning_rate": 7.630756858877835e-06, |
| "loss": 0.6917, |
| "mean_token_accuracy": 0.7894322812557221, |
| "num_tokens": 4481112230.0, |
| "step": 2155 |
| }, |
| { |
| "epoch": 2.2222793928479545, |
| "grad_norm": 0.1990688294172287, |
| "learning_rate": 7.598486980945721e-06, |
| "loss": 0.696, |
| "mean_token_accuracy": 0.7881556123495101, |
| "num_tokens": 4491526989.0, |
| "step": 2160 |
| }, |
| { |
| "epoch": 2.2274247491638794, |
| "grad_norm": 0.19104093313217163, |
| "learning_rate": 7.566374722247625e-06, |
| "loss": 0.7071, |
| "mean_token_accuracy": 0.7854187726974488, |
| "num_tokens": 4501890999.0, |
| "step": 2165 |
| }, |
| { |
| "epoch": 2.2325701054798044, |
| "grad_norm": 0.17589214444160461, |
| "learning_rate": 7.534421115425832e-06, |
| "loss": 0.7082, |
| "mean_token_accuracy": 0.7851406782865524, |
| "num_tokens": 4512263755.0, |
| "step": 2170 |
| }, |
| { |
| "epoch": 2.2377154617957293, |
| "grad_norm": 0.18910154700279236, |
| "learning_rate": 7.502627188020828e-06, |
| "loss": 0.7018, |
| "mean_token_accuracy": 0.7865214943885803, |
| "num_tokens": 4522673409.0, |
| "step": 2175 |
| }, |
| { |
| "epoch": 2.2428608181116543, |
| "grad_norm": 0.19419220089912415, |
| "learning_rate": 7.470993962438233e-06, |
| "loss": 0.6981, |
| "mean_token_accuracy": 0.787706145644188, |
| "num_tokens": 4533050765.0, |
| "step": 2180 |
| }, |
| { |
| "epoch": 2.248006174427579, |
| "grad_norm": 0.19314493238925934, |
| "learning_rate": 7.439522455915941e-06, |
| "loss": 0.6921, |
| "mean_token_accuracy": 0.7892083436250686, |
| "num_tokens": 4543452165.0, |
| "step": 2185 |
| }, |
| { |
| "epoch": 2.253151530743504, |
| "grad_norm": 0.21107150614261627, |
| "learning_rate": 7.408213680491409e-06, |
| "loss": 0.6969, |
| "mean_token_accuracy": 0.7882888942956925, |
| "num_tokens": 4553853739.0, |
| "step": 2190 |
| }, |
| { |
| "epoch": 2.258296887059429, |
| "grad_norm": 0.1907646358013153, |
| "learning_rate": 7.377068642969104e-06, |
| "loss": 0.6963, |
| "mean_token_accuracy": 0.7884598582983017, |
| "num_tokens": 4564274917.0, |
| "step": 2195 |
| }, |
| { |
| "epoch": 2.2634422433753536, |
| "grad_norm": 0.17553763091564178, |
| "learning_rate": 7.346088344888125e-06, |
| "loss": 0.6951, |
| "mean_token_accuracy": 0.788986611366272, |
| "num_tokens": 4574641137.0, |
| "step": 2200 |
| }, |
| { |
| "epoch": 2.2685875996912785, |
| "grad_norm": 0.18935340642929077, |
| "learning_rate": 7.315273782490008e-06, |
| "loss": 0.6994, |
| "mean_token_accuracy": 0.7872116446495057, |
| "num_tokens": 4585019689.0, |
| "step": 2205 |
| }, |
| { |
| "epoch": 2.2737329560072035, |
| "grad_norm": 0.19580236077308655, |
| "learning_rate": 7.284625946686685e-06, |
| "loss": 0.693, |
| "mean_token_accuracy": 0.7892390996217727, |
| "num_tokens": 4595438984.0, |
| "step": 2210 |
| }, |
| { |
| "epoch": 2.2788783123231284, |
| "grad_norm": 0.19986537098884583, |
| "learning_rate": 7.254145823028617e-06, |
| "loss": 0.6936, |
| "mean_token_accuracy": 0.7890995383262634, |
| "num_tokens": 4605823855.0, |
| "step": 2215 |
| }, |
| { |
| "epoch": 2.2840236686390534, |
| "grad_norm": 0.1742008775472641, |
| "learning_rate": 7.2238343916730915e-06, |
| "loss": 0.6993, |
| "mean_token_accuracy": 0.7871985048055649, |
| "num_tokens": 4616238871.0, |
| "step": 2220 |
| }, |
| { |
| "epoch": 2.2891690249549783, |
| "grad_norm": 0.19267351925373077, |
| "learning_rate": 7.193692627352726e-06, |
| "loss": 0.6872, |
| "mean_token_accuracy": 0.7908998429775238, |
| "num_tokens": 4626643459.0, |
| "step": 2225 |
| }, |
| { |
| "epoch": 2.294314381270903, |
| "grad_norm": 0.17010417580604553, |
| "learning_rate": 7.163721499344107e-06, |
| "loss": 0.6966, |
| "mean_token_accuracy": 0.7883331865072251, |
| "num_tokens": 4637044833.0, |
| "step": 2230 |
| }, |
| { |
| "epoch": 2.2994597375868278, |
| "grad_norm": 0.17460167407989502, |
| "learning_rate": 7.133921971436622e-06, |
| "loss": 0.6989, |
| "mean_token_accuracy": 0.7876080513000489, |
| "num_tokens": 4647444102.0, |
| "step": 2235 |
| }, |
| { |
| "epoch": 2.3046050939027527, |
| "grad_norm": 0.17576715350151062, |
| "learning_rate": 7.104295001901473e-06, |
| "loss": 0.6878, |
| "mean_token_accuracy": 0.7904452890157699, |
| "num_tokens": 4657824599.0, |
| "step": 2240 |
| }, |
| { |
| "epoch": 2.3097504502186776, |
| "grad_norm": 0.16763444244861603, |
| "learning_rate": 7.074841543460853e-06, |
| "loss": 0.6844, |
| "mean_token_accuracy": 0.7918480813503266, |
| "num_tokens": 4668181094.0, |
| "step": 2245 |
| }, |
| { |
| "epoch": 2.3148958065346026, |
| "grad_norm": 0.1823350489139557, |
| "learning_rate": 7.0455625432573186e-06, |
| "loss": 0.6932, |
| "mean_token_accuracy": 0.7892414182424545, |
| "num_tokens": 4678584523.0, |
| "step": 2250 |
| }, |
| { |
| "epoch": 2.3200411628505275, |
| "grad_norm": 0.17247016727924347, |
| "learning_rate": 7.016458942823321e-06, |
| "loss": 0.6869, |
| "mean_token_accuracy": 0.7909802347421646, |
| "num_tokens": 4688981096.0, |
| "step": 2255 |
| }, |
| { |
| "epoch": 2.3251865191664525, |
| "grad_norm": 0.17673242092132568, |
| "learning_rate": 6.987531678050943e-06, |
| "loss": 0.6802, |
| "mean_token_accuracy": 0.7923660695552825, |
| "num_tokens": 4699404625.0, |
| "step": 2260 |
| }, |
| { |
| "epoch": 2.330331875482377, |
| "grad_norm": 0.16854751110076904, |
| "learning_rate": 6.958781679161788e-06, |
| "loss": 0.6842, |
| "mean_token_accuracy": 0.7919197797775268, |
| "num_tokens": 4709811697.0, |
| "step": 2265 |
| }, |
| { |
| "epoch": 2.335477231798302, |
| "grad_norm": 0.1761576384305954, |
| "learning_rate": 6.930209870677077e-06, |
| "loss": 0.685, |
| "mean_token_accuracy": 0.7914377897977829, |
| "num_tokens": 4720237781.0, |
| "step": 2270 |
| }, |
| { |
| "epoch": 2.340622588114227, |
| "grad_norm": 0.17306455969810486, |
| "learning_rate": 6.901817171387917e-06, |
| "loss": 0.7019, |
| "mean_token_accuracy": 0.7869494408369064, |
| "num_tokens": 4730606260.0, |
| "step": 2275 |
| }, |
| { |
| "epoch": 2.345767944430152, |
| "grad_norm": 0.18955180048942566, |
| "learning_rate": 6.873604494325757e-06, |
| "loss": 0.6948, |
| "mean_token_accuracy": 0.7886533975601197, |
| "num_tokens": 4741014261.0, |
| "step": 2280 |
| }, |
| { |
| "epoch": 2.3509133007460767, |
| "grad_norm": 0.1918182373046875, |
| "learning_rate": 6.845572746733015e-06, |
| "loss": 0.6907, |
| "mean_token_accuracy": 0.7898207098245621, |
| "num_tokens": 4751422939.0, |
| "step": 2285 |
| }, |
| { |
| "epoch": 2.3560586570620017, |
| "grad_norm": 0.17227977514266968, |
| "learning_rate": 6.8177228300339186e-06, |
| "loss": 0.6926, |
| "mean_token_accuracy": 0.7893718838691711, |
| "num_tokens": 4761799091.0, |
| "step": 2290 |
| }, |
| { |
| "epoch": 2.361204013377926, |
| "grad_norm": 0.19379264116287231, |
| "learning_rate": 6.79005563980551e-06, |
| "loss": 0.6867, |
| "mean_token_accuracy": 0.7909263670444489, |
| "num_tokens": 4772203894.0, |
| "step": 2295 |
| }, |
| { |
| "epoch": 2.366349369693851, |
| "grad_norm": 0.16864101588726044, |
| "learning_rate": 6.7625720657488526e-06, |
| "loss": 0.6954, |
| "mean_token_accuracy": 0.7885312736034393, |
| "num_tokens": 4782600873.0, |
| "step": 2300 |
| }, |
| { |
| "epoch": 2.371494726009776, |
| "grad_norm": 0.17416614294052124, |
| "learning_rate": 6.735272991660415e-06, |
| "loss": 0.7108, |
| "mean_token_accuracy": 0.784038883447647, |
| "num_tokens": 4793015981.0, |
| "step": 2305 |
| }, |
| { |
| "epoch": 2.376640082325701, |
| "grad_norm": 0.17601124942302704, |
| "learning_rate": 6.708159295403645e-06, |
| "loss": 0.6931, |
| "mean_token_accuracy": 0.7890658885240555, |
| "num_tokens": 4803428229.0, |
| "step": 2310 |
| }, |
| { |
| "epoch": 2.381785438641626, |
| "grad_norm": 0.17575791478157043, |
| "learning_rate": 6.681231848880758e-06, |
| "loss": 0.6931, |
| "mean_token_accuracy": 0.7891870647668838, |
| "num_tokens": 4813832709.0, |
| "step": 2315 |
| }, |
| { |
| "epoch": 2.386930794957551, |
| "grad_norm": 0.171165332198143, |
| "learning_rate": 6.654491518004684e-06, |
| "loss": 0.6977, |
| "mean_token_accuracy": 0.7878083676099777, |
| "num_tokens": 4824217333.0, |
| "step": 2320 |
| }, |
| { |
| "epoch": 2.392076151273476, |
| "grad_norm": 0.174204021692276, |
| "learning_rate": 6.6279391626712195e-06, |
| "loss": 0.6767, |
| "mean_token_accuracy": 0.7939162909984588, |
| "num_tokens": 4834578249.0, |
| "step": 2325 |
| }, |
| { |
| "epoch": 2.397221507589401, |
| "grad_norm": 0.17255154252052307, |
| "learning_rate": 6.601575636731393e-06, |
| "loss": 0.6853, |
| "mean_token_accuracy": 0.7911572694778443, |
| "num_tokens": 4845003859.0, |
| "step": 2330 |
| }, |
| { |
| "epoch": 2.4023668639053253, |
| "grad_norm": 0.17027340829372406, |
| "learning_rate": 6.575401787963991e-06, |
| "loss": 0.7016, |
| "mean_token_accuracy": 0.7866772085428237, |
| "num_tokens": 4855396377.0, |
| "step": 2335 |
| }, |
| { |
| "epoch": 2.40751222022125, |
| "grad_norm": 0.1804320216178894, |
| "learning_rate": 6.549418458048301e-06, |
| "loss": 0.6944, |
| "mean_token_accuracy": 0.7887315809726715, |
| "num_tokens": 4865807607.0, |
| "step": 2340 |
| }, |
| { |
| "epoch": 2.412657576537175, |
| "grad_norm": 0.17891834676265717, |
| "learning_rate": 6.523626482537051e-06, |
| "loss": 0.6924, |
| "mean_token_accuracy": 0.7891820967197418, |
| "num_tokens": 4876203853.0, |
| "step": 2345 |
| }, |
| { |
| "epoch": 2.4178029328531, |
| "grad_norm": 0.17687129974365234, |
| "learning_rate": 6.498026690829529e-06, |
| "loss": 0.6879, |
| "mean_token_accuracy": 0.7905671745538712, |
| "num_tokens": 4886593797.0, |
| "step": 2350 |
| }, |
| { |
| "epoch": 2.422948289169025, |
| "grad_norm": 0.16701921820640564, |
| "learning_rate": 6.472619906144924e-06, |
| "loss": 0.7011, |
| "mean_token_accuracy": 0.7867477118968964, |
| "num_tokens": 4897020034.0, |
| "step": 2355 |
| }, |
| { |
| "epoch": 2.42809364548495, |
| "grad_norm": 0.18092454969882965, |
| "learning_rate": 6.447406945495843e-06, |
| "loss": 0.6846, |
| "mean_token_accuracy": 0.7916429519653321, |
| "num_tokens": 4907433880.0, |
| "step": 2360 |
| }, |
| { |
| "epoch": 2.4332390018008745, |
| "grad_norm": 0.17751817405223846, |
| "learning_rate": 6.422388619662045e-06, |
| "loss": 0.694, |
| "mean_token_accuracy": 0.7888200342655182, |
| "num_tokens": 4917840148.0, |
| "step": 2365 |
| }, |
| { |
| "epoch": 2.4383843581167994, |
| "grad_norm": 0.1927623152732849, |
| "learning_rate": 6.3975657331643715e-06, |
| "loss": 0.6959, |
| "mean_token_accuracy": 0.7883234590291976, |
| "num_tokens": 4928232237.0, |
| "step": 2370 |
| }, |
| { |
| "epoch": 2.4435297144327244, |
| "grad_norm": 0.17448590695858002, |
| "learning_rate": 6.3729390842388585e-06, |
| "loss": 0.7, |
| "mean_token_accuracy": 0.7875474035739899, |
| "num_tokens": 4938631938.0, |
| "step": 2375 |
| }, |
| { |
| "epoch": 2.4486750707486493, |
| "grad_norm": 0.1870429664850235, |
| "learning_rate": 6.348509464811088e-06, |
| "loss": 0.698, |
| "mean_token_accuracy": 0.7877880901098251, |
| "num_tokens": 4949047787.0, |
| "step": 2380 |
| }, |
| { |
| "epoch": 2.4538204270645743, |
| "grad_norm": 0.1780596822500229, |
| "learning_rate": 6.3242776604707144e-06, |
| "loss": 0.6918, |
| "mean_token_accuracy": 0.7893176406621933, |
| "num_tokens": 4959424736.0, |
| "step": 2385 |
| }, |
| { |
| "epoch": 2.458965783380499, |
| "grad_norm": 0.16919124126434326, |
| "learning_rate": 6.300244450446195e-06, |
| "loss": 0.7012, |
| "mean_token_accuracy": 0.7870047926902771, |
| "num_tokens": 4969829870.0, |
| "step": 2390 |
| }, |
| { |
| "epoch": 2.464111139696424, |
| "grad_norm": 0.172256737947464, |
| "learning_rate": 6.27641060757974e-06, |
| "loss": 0.7041, |
| "mean_token_accuracy": 0.7858896970748901, |
| "num_tokens": 4980197224.0, |
| "step": 2395 |
| }, |
| { |
| "epoch": 2.469256496012349, |
| "grad_norm": 0.17399129271507263, |
| "learning_rate": 6.252776898302453e-06, |
| "loss": 0.6824, |
| "mean_token_accuracy": 0.7921358823776246, |
| "num_tokens": 4990600805.0, |
| "step": 2400 |
| }, |
| { |
| "epoch": 2.4744018523282736, |
| "grad_norm": 0.1758703887462616, |
| "learning_rate": 6.2293440826097005e-06, |
| "loss": 0.6961, |
| "mean_token_accuracy": 0.7880290925502778, |
| "num_tokens": 5000978207.0, |
| "step": 2405 |
| }, |
| { |
| "epoch": 2.4795472086441985, |
| "grad_norm": 0.1842864602804184, |
| "learning_rate": 6.206112914036657e-06, |
| "loss": 0.6965, |
| "mean_token_accuracy": 0.7884801357984543, |
| "num_tokens": 5011384736.0, |
| "step": 2410 |
| }, |
| { |
| "epoch": 2.4846925649601235, |
| "grad_norm": 0.1803123950958252, |
| "learning_rate": 6.1830841396340705e-06, |
| "loss": 0.6991, |
| "mean_token_accuracy": 0.7872378647327423, |
| "num_tokens": 5021771302.0, |
| "step": 2415 |
| }, |
| { |
| "epoch": 2.4898379212760484, |
| "grad_norm": 0.17343245446681976, |
| "learning_rate": 6.160258499944255e-06, |
| "loss": 0.6899, |
| "mean_token_accuracy": 0.7900263160467148, |
| "num_tokens": 5032158385.0, |
| "step": 2420 |
| }, |
| { |
| "epoch": 2.4949832775919734, |
| "grad_norm": 0.18295209109783173, |
| "learning_rate": 6.137636728977267e-06, |
| "loss": 0.6873, |
| "mean_token_accuracy": 0.7904597342014312, |
| "num_tokens": 5042580817.0, |
| "step": 2425 |
| }, |
| { |
| "epoch": 2.500128633907898, |
| "grad_norm": 0.1744978278875351, |
| "learning_rate": 6.115219554187303e-06, |
| "loss": 0.6944, |
| "mean_token_accuracy": 0.7883099675178528, |
| "num_tokens": 5052996785.0, |
| "step": 2430 |
| }, |
| { |
| "epoch": 2.505273990223823, |
| "grad_norm": 0.17044760286808014, |
| "learning_rate": 6.0930076964493034e-06, |
| "loss": 0.7044, |
| "mean_token_accuracy": 0.7858549505472183, |
| "num_tokens": 5063403777.0, |
| "step": 2435 |
| }, |
| { |
| "epoch": 2.5104193465397477, |
| "grad_norm": 0.1745147705078125, |
| "learning_rate": 6.07100187003578e-06, |
| "loss": 0.6946, |
| "mean_token_accuracy": 0.7888891041278839, |
| "num_tokens": 5073787624.0, |
| "step": 2440 |
| }, |
| { |
| "epoch": 2.5155647028556727, |
| "grad_norm": 0.17163583636283875, |
| "learning_rate": 6.049202782593837e-06, |
| "loss": 0.7091, |
| "mean_token_accuracy": 0.784814390540123, |
| "num_tokens": 5084155762.0, |
| "step": 2445 |
| }, |
| { |
| "epoch": 2.5207100591715976, |
| "grad_norm": 0.1630815863609314, |
| "learning_rate": 6.027611135122423e-06, |
| "loss": 0.6833, |
| "mean_token_accuracy": 0.7919480204582214, |
| "num_tokens": 5094520579.0, |
| "step": 2450 |
| }, |
| { |
| "epoch": 2.5258554154875226, |
| "grad_norm": 0.169570654630661, |
| "learning_rate": 6.006227621949783e-06, |
| "loss": 0.6912, |
| "mean_token_accuracy": 0.7897911489009857, |
| "num_tokens": 5104935332.0, |
| "step": 2455 |
| }, |
| { |
| "epoch": 2.5310007718034475, |
| "grad_norm": 0.17307248711585999, |
| "learning_rate": 5.985052930711133e-06, |
| "loss": 0.686, |
| "mean_token_accuracy": 0.7910365283489227, |
| "num_tokens": 5115312123.0, |
| "step": 2460 |
| }, |
| { |
| "epoch": 2.5361461281193725, |
| "grad_norm": 0.1717165857553482, |
| "learning_rate": 5.964087742326549e-06, |
| "loss": 0.7048, |
| "mean_token_accuracy": 0.7863658338785171, |
| "num_tokens": 5125722883.0, |
| "step": 2465 |
| }, |
| { |
| "epoch": 2.5412914844352974, |
| "grad_norm": 0.16661353409290314, |
| "learning_rate": 5.943332730979067e-06, |
| "loss": 0.6982, |
| "mean_token_accuracy": 0.7878574222326279, |
| "num_tokens": 5136118397.0, |
| "step": 2470 |
| }, |
| { |
| "epoch": 2.546436840751222, |
| "grad_norm": 0.18003134429454803, |
| "learning_rate": 5.922788564093009e-06, |
| "loss": 0.6942, |
| "mean_token_accuracy": 0.788626492023468, |
| "num_tokens": 5146490125.0, |
| "step": 2475 |
| }, |
| { |
| "epoch": 2.551582197067147, |
| "grad_norm": 0.17579644918441772, |
| "learning_rate": 5.902455902312511e-06, |
| "loss": 0.7027, |
| "mean_token_accuracy": 0.7862021327018738, |
| "num_tokens": 5156884568.0, |
| "step": 2480 |
| }, |
| { |
| "epoch": 2.556727553383072, |
| "grad_norm": 0.1727745682001114, |
| "learning_rate": 5.88233539948029e-06, |
| "loss": 0.6925, |
| "mean_token_accuracy": 0.7890074193477631, |
| "num_tokens": 5167297476.0, |
| "step": 2485 |
| }, |
| { |
| "epoch": 2.5618729096989967, |
| "grad_norm": 0.18698406219482422, |
| "learning_rate": 5.862427702616605e-06, |
| "loss": 0.6831, |
| "mean_token_accuracy": 0.7916372120380402, |
| "num_tokens": 5177725329.0, |
| "step": 2490 |
| }, |
| { |
| "epoch": 2.5670182660149217, |
| "grad_norm": 0.17920783162117004, |
| "learning_rate": 5.842733451898467e-06, |
| "loss": 0.7028, |
| "mean_token_accuracy": 0.7861807703971863, |
| "num_tokens": 5188136857.0, |
| "step": 2495 |
| }, |
| { |
| "epoch": 2.572163622330846, |
| "grad_norm": 0.17899206280708313, |
| "learning_rate": 5.823253280639039e-06, |
| "loss": 0.6814, |
| "mean_token_accuracy": 0.7923789769411087, |
| "num_tokens": 5198537170.0, |
| "step": 2500 |
| }, |
| { |
| "epoch": 2.577308978646771, |
| "grad_norm": 0.17629799246788025, |
| "learning_rate": 5.803987815267268e-06, |
| "loss": 0.6979, |
| "mean_token_accuracy": 0.7875349700450898, |
| "num_tokens": 5208920419.0, |
| "step": 2505 |
| }, |
| { |
| "epoch": 2.582454334962696, |
| "grad_norm": 0.16480837762355804, |
| "learning_rate": 5.7849376753077625e-06, |
| "loss": 0.6856, |
| "mean_token_accuracy": 0.7911129057407379, |
| "num_tokens": 5219341909.0, |
| "step": 2510 |
| }, |
| { |
| "epoch": 2.587599691278621, |
| "grad_norm": 0.1704104244709015, |
| "learning_rate": 5.766103473360842e-06, |
| "loss": 0.6955, |
| "mean_token_accuracy": 0.7883382886648178, |
| "num_tokens": 5229692388.0, |
| "step": 2515 |
| }, |
| { |
| "epoch": 2.592745047594546, |
| "grad_norm": 0.17449352145195007, |
| "learning_rate": 5.74748581508286e-06, |
| "loss": 0.6943, |
| "mean_token_accuracy": 0.788813516497612, |
| "num_tokens": 5240057257.0, |
| "step": 2520 |
| }, |
| { |
| "epoch": 2.597890403910471, |
| "grad_norm": 0.17114900052547455, |
| "learning_rate": 5.729085299166713e-06, |
| "loss": 0.6925, |
| "mean_token_accuracy": 0.7888531744480133, |
| "num_tokens": 5250472272.0, |
| "step": 2525 |
| }, |
| { |
| "epoch": 2.603035760226396, |
| "grad_norm": 0.19473806023597717, |
| "learning_rate": 5.710902517322597e-06, |
| "loss": 0.7034, |
| "mean_token_accuracy": 0.7864585638046264, |
| "num_tokens": 5260858183.0, |
| "step": 2530 |
| }, |
| { |
| "epoch": 2.6081811165423208, |
| "grad_norm": 0.17495019733905792, |
| "learning_rate": 5.6929380542589764e-06, |
| "loss": 0.6919, |
| "mean_token_accuracy": 0.7893568813800812, |
| "num_tokens": 5271278572.0, |
| "step": 2535 |
| }, |
| { |
| "epoch": 2.6133264728582457, |
| "grad_norm": 0.16885867714881897, |
| "learning_rate": 5.675192487663777e-06, |
| "loss": 0.6922, |
| "mean_token_accuracy": 0.7892817795276642, |
| "num_tokens": 5281676141.0, |
| "step": 2540 |
| }, |
| { |
| "epoch": 2.61847182917417, |
| "grad_norm": 0.17458729445934296, |
| "learning_rate": 5.657666388185823e-06, |
| "loss": 0.6925, |
| "mean_token_accuracy": 0.7891670197248459, |
| "num_tokens": 5292081395.0, |
| "step": 2545 |
| }, |
| { |
| "epoch": 2.623617185490095, |
| "grad_norm": 0.17178680002689362, |
| "learning_rate": 5.640360319416467e-06, |
| "loss": 0.6888, |
| "mean_token_accuracy": 0.7902668923139572, |
| "num_tokens": 5302469310.0, |
| "step": 2550 |
| }, |
| { |
| "epoch": 2.62876254180602, |
| "grad_norm": 0.1776588261127472, |
| "learning_rate": 5.623274837871483e-06, |
| "loss": 0.694, |
| "mean_token_accuracy": 0.7890095263719559, |
| "num_tokens": 5312857824.0, |
| "step": 2555 |
| }, |
| { |
| "epoch": 2.633907898121945, |
| "grad_norm": 0.17728105187416077, |
| "learning_rate": 5.606410492973162e-06, |
| "loss": 0.6885, |
| "mean_token_accuracy": 0.7901356816291809, |
| "num_tokens": 5323248156.0, |
| "step": 2560 |
| }, |
| { |
| "epoch": 2.63905325443787, |
| "grad_norm": 0.16960322856903076, |
| "learning_rate": 5.589767827032649e-06, |
| "loss": 0.7014, |
| "mean_token_accuracy": 0.7869833618402481, |
| "num_tokens": 5333628919.0, |
| "step": 2565 |
| }, |
| { |
| "epoch": 2.6441986107537945, |
| "grad_norm": 0.17245355248451233, |
| "learning_rate": 5.573347375232493e-06, |
| "loss": 0.6918, |
| "mean_token_accuracy": 0.7889738440513611, |
| "num_tokens": 5344021897.0, |
| "step": 2570 |
| }, |
| { |
| "epoch": 2.6493439670697194, |
| "grad_norm": 0.16664022207260132, |
| "learning_rate": 5.557149665609455e-06, |
| "loss": 0.7, |
| "mean_token_accuracy": 0.7870820313692093, |
| "num_tokens": 5354420664.0, |
| "step": 2575 |
| }, |
| { |
| "epoch": 2.6544893233856444, |
| "grad_norm": 0.17978526651859283, |
| "learning_rate": 5.54117521903751e-06, |
| "loss": 0.6783, |
| "mean_token_accuracy": 0.793021947145462, |
| "num_tokens": 5364835797.0, |
| "step": 2580 |
| }, |
| { |
| "epoch": 2.6596346797015693, |
| "grad_norm": 0.17351332306861877, |
| "learning_rate": 5.525424549211112e-06, |
| "loss": 0.6964, |
| "mean_token_accuracy": 0.7881974041461944, |
| "num_tokens": 5375251218.0, |
| "step": 2585 |
| }, |
| { |
| "epoch": 2.6647800360174942, |
| "grad_norm": 0.1651052087545395, |
| "learning_rate": 5.509898162628657e-06, |
| "loss": 0.6956, |
| "mean_token_accuracy": 0.7883888274431229, |
| "num_tokens": 5385653462.0, |
| "step": 2590 |
| }, |
| { |
| "epoch": 2.669925392333419, |
| "grad_norm": 0.17517080903053284, |
| "learning_rate": 5.494596558576215e-06, |
| "loss": 0.7, |
| "mean_token_accuracy": 0.7868314325809479, |
| "num_tokens": 5396055523.0, |
| "step": 2595 |
| }, |
| { |
| "epoch": 2.675070748649344, |
| "grad_norm": 0.18584103882312775, |
| "learning_rate": 5.4795202291114655e-06, |
| "loss": 0.6948, |
| "mean_token_accuracy": 0.7886532038450241, |
| "num_tokens": 5406483427.0, |
| "step": 2600 |
| }, |
| { |
| "epoch": 2.680216104965269, |
| "grad_norm": 0.1600140780210495, |
| "learning_rate": 5.464669659047871e-06, |
| "loss": 0.7105, |
| "mean_token_accuracy": 0.7844233065843582, |
| "num_tokens": 5416893350.0, |
| "step": 2605 |
| }, |
| { |
| "epoch": 2.6853614612811936, |
| "grad_norm": 0.16962645947933197, |
| "learning_rate": 5.450045325939086e-06, |
| "loss": 0.699, |
| "mean_token_accuracy": 0.7872962862253189, |
| "num_tokens": 5427314991.0, |
| "step": 2610 |
| }, |
| { |
| "epoch": 2.6905068175971185, |
| "grad_norm": 0.16842614114284515, |
| "learning_rate": 5.4356477000636155e-06, |
| "loss": 0.696, |
| "mean_token_accuracy": 0.7881788671016693, |
| "num_tokens": 5437722441.0, |
| "step": 2615 |
| }, |
| { |
| "epoch": 2.6956521739130435, |
| "grad_norm": 0.1736549735069275, |
| "learning_rate": 5.42147724440967e-06, |
| "loss": 0.6804, |
| "mean_token_accuracy": 0.7926200598478317, |
| "num_tokens": 5448128609.0, |
| "step": 2620 |
| }, |
| { |
| "epoch": 2.7007975302289684, |
| "grad_norm": 0.17829610407352448, |
| "learning_rate": 5.407534414660296e-06, |
| "loss": 0.7076, |
| "mean_token_accuracy": 0.7849996328353882, |
| "num_tokens": 5458552679.0, |
| "step": 2625 |
| }, |
| { |
| "epoch": 2.7059428865448933, |
| "grad_norm": 0.17205742001533508, |
| "learning_rate": 5.3938196591787055e-06, |
| "loss": 0.6886, |
| "mean_token_accuracy": 0.7903157830238342, |
| "num_tokens": 5468959670.0, |
| "step": 2630 |
| }, |
| { |
| "epoch": 2.711088242860818, |
| "grad_norm": 0.19865980744361877, |
| "learning_rate": 5.380333418993874e-06, |
| "loss": 0.6969, |
| "mean_token_accuracy": 0.7879400312900543, |
| "num_tokens": 5479359994.0, |
| "step": 2635 |
| }, |
| { |
| "epoch": 2.716233599176743, |
| "grad_norm": 0.17879824340343475, |
| "learning_rate": 5.367076127786349e-06, |
| "loss": 0.6799, |
| "mean_token_accuracy": 0.7927328020334243, |
| "num_tokens": 5489776454.0, |
| "step": 2640 |
| }, |
| { |
| "epoch": 2.7213789554926677, |
| "grad_norm": 0.16733145713806152, |
| "learning_rate": 5.354048211874305e-06, |
| "loss": 0.7009, |
| "mean_token_accuracy": 0.7868389576673508, |
| "num_tokens": 5500195720.0, |
| "step": 2645 |
| }, |
| { |
| "epoch": 2.7265243118085927, |
| "grad_norm": 0.17073538899421692, |
| "learning_rate": 5.341250090199836e-06, |
| "loss": 0.689, |
| "mean_token_accuracy": 0.7902264356613159, |
| "num_tokens": 5510590496.0, |
| "step": 2650 |
| }, |
| { |
| "epoch": 2.7316696681245176, |
| "grad_norm": 0.16798222064971924, |
| "learning_rate": 5.328682174315484e-06, |
| "loss": 0.6997, |
| "mean_token_accuracy": 0.7871428996324539, |
| "num_tokens": 5520984194.0, |
| "step": 2655 |
| }, |
| { |
| "epoch": 2.7368150244404426, |
| "grad_norm": 0.17511659860610962, |
| "learning_rate": 5.316344868370999e-06, |
| "loss": 0.7027, |
| "mean_token_accuracy": 0.7862639844417572, |
| "num_tokens": 5531378820.0, |
| "step": 2660 |
| }, |
| { |
| "epoch": 2.7419603807563675, |
| "grad_norm": 0.1656564474105835, |
| "learning_rate": 5.304238569100351e-06, |
| "loss": 0.6919, |
| "mean_token_accuracy": 0.7892627060413361, |
| "num_tokens": 5541788819.0, |
| "step": 2665 |
| }, |
| { |
| "epoch": 2.7471057370722924, |
| "grad_norm": 0.1768578141927719, |
| "learning_rate": 5.2923636658089674e-06, |
| "loss": 0.6951, |
| "mean_token_accuracy": 0.7887540727853775, |
| "num_tokens": 5552136282.0, |
| "step": 2670 |
| }, |
| { |
| "epoch": 2.7522510933882174, |
| "grad_norm": 0.16567444801330566, |
| "learning_rate": 5.280720540361213e-06, |
| "loss": 0.6902, |
| "mean_token_accuracy": 0.7896697282791137, |
| "num_tokens": 5562532385.0, |
| "step": 2675 |
| }, |
| { |
| "epoch": 2.757396449704142, |
| "grad_norm": 0.16643787920475006, |
| "learning_rate": 5.2693095671681125e-06, |
| "loss": 0.6946, |
| "mean_token_accuracy": 0.7891028523445129, |
| "num_tokens": 5572953965.0, |
| "step": 2680 |
| }, |
| { |
| "epoch": 2.762541806020067, |
| "grad_norm": 0.1679885983467102, |
| "learning_rate": 5.258131113175312e-06, |
| "loss": 0.6928, |
| "mean_token_accuracy": 0.7889256983995437, |
| "num_tokens": 5583365916.0, |
| "step": 2685 |
| }, |
| { |
| "epoch": 2.7676871623359918, |
| "grad_norm": 0.17079950869083405, |
| "learning_rate": 5.247185537851277e-06, |
| "loss": 0.693, |
| "mean_token_accuracy": 0.7887627691030502, |
| "num_tokens": 5593766520.0, |
| "step": 2690 |
| }, |
| { |
| "epoch": 2.7728325186519167, |
| "grad_norm": 0.17430275678634644, |
| "learning_rate": 5.236473193175727e-06, |
| "loss": 0.693, |
| "mean_token_accuracy": 0.7892514944076539, |
| "num_tokens": 5604146645.0, |
| "step": 2695 |
| }, |
| { |
| "epoch": 2.7779778749678417, |
| "grad_norm": 0.1720867156982422, |
| "learning_rate": 5.225994423628329e-06, |
| "loss": 0.6982, |
| "mean_token_accuracy": 0.7877674490213394, |
| "num_tokens": 5614554383.0, |
| "step": 2700 |
| }, |
| { |
| "epoch": 2.783123231283766, |
| "grad_norm": 0.17518489062786102, |
| "learning_rate": 5.215749566177612e-06, |
| "loss": 0.6908, |
| "mean_token_accuracy": 0.789122948050499, |
| "num_tokens": 5624946901.0, |
| "step": 2705 |
| }, |
| { |
| "epoch": 2.788268587599691, |
| "grad_norm": 0.17626863718032837, |
| "learning_rate": 5.2057389502701315e-06, |
| "loss": 0.6962, |
| "mean_token_accuracy": 0.7886440306901932, |
| "num_tokens": 5635349602.0, |
| "step": 2710 |
| }, |
| { |
| "epoch": 2.793413943915616, |
| "grad_norm": 0.17560714483261108, |
| "learning_rate": 5.19596289781988e-06, |
| "loss": 0.6859, |
| "mean_token_accuracy": 0.7909172236919403, |
| "num_tokens": 5645756384.0, |
| "step": 2715 |
| }, |
| { |
| "epoch": 2.798559300231541, |
| "grad_norm": 0.16886913776397705, |
| "learning_rate": 5.186421723197922e-06, |
| "loss": 0.6868, |
| "mean_token_accuracy": 0.7907733172178268, |
| "num_tokens": 5656178271.0, |
| "step": 2720 |
| }, |
| { |
| "epoch": 2.803704656547466, |
| "grad_norm": 0.1693251132965088, |
| "learning_rate": 5.177115733222307e-06, |
| "loss": 0.6947, |
| "mean_token_accuracy": 0.7886239379644394, |
| "num_tokens": 5666600795.0, |
| "step": 2725 |
| }, |
| { |
| "epoch": 2.808850012863391, |
| "grad_norm": 0.16803273558616638, |
| "learning_rate": 5.168045227148184e-06, |
| "loss": 0.6972, |
| "mean_token_accuracy": 0.787845715880394, |
| "num_tokens": 5676999671.0, |
| "step": 2730 |
| }, |
| { |
| "epoch": 2.813995369179316, |
| "grad_norm": 0.16651484370231628, |
| "learning_rate": 5.159210496658182e-06, |
| "loss": 0.6884, |
| "mean_token_accuracy": 0.7903055369853973, |
| "num_tokens": 5687397402.0, |
| "step": 2735 |
| }, |
| { |
| "epoch": 2.8191407254952408, |
| "grad_norm": 0.1660391241312027, |
| "learning_rate": 5.15061182585304e-06, |
| "loss": 0.6792, |
| "mean_token_accuracy": 0.7931360393762589, |
| "num_tokens": 5697794080.0, |
| "step": 2740 |
| }, |
| { |
| "epoch": 2.8242860818111657, |
| "grad_norm": 0.17520754039287567, |
| "learning_rate": 5.1422494912424595e-06, |
| "loss": 0.6986, |
| "mean_token_accuracy": 0.7876220345497131, |
| "num_tokens": 5708182662.0, |
| "step": 2745 |
| }, |
| { |
| "epoch": 2.82943143812709, |
| "grad_norm": 0.16872966289520264, |
| "learning_rate": 5.134123761736216e-06, |
| "loss": 0.6966, |
| "mean_token_accuracy": 0.7881220698356628, |
| "num_tokens": 5718594864.0, |
| "step": 2750 |
| }, |
| { |
| "epoch": 2.834576794443015, |
| "grad_norm": 0.16700060665607452, |
| "learning_rate": 5.126234898635518e-06, |
| "loss": 0.6942, |
| "mean_token_accuracy": 0.788716048002243, |
| "num_tokens": 5728995814.0, |
| "step": 2755 |
| }, |
| { |
| "epoch": 2.83972215075894, |
| "grad_norm": 0.17934595048427582, |
| "learning_rate": 5.118583155624593e-06, |
| "loss": 0.6929, |
| "mean_token_accuracy": 0.7886988967657089, |
| "num_tokens": 5739395668.0, |
| "step": 2760 |
| }, |
| { |
| "epoch": 2.844867507074865, |
| "grad_norm": 0.17512430250644684, |
| "learning_rate": 5.111168778762542e-06, |
| "loss": 0.6938, |
| "mean_token_accuracy": 0.7889688044786454, |
| "num_tokens": 5749807349.0, |
| "step": 2765 |
| }, |
| { |
| "epoch": 2.85001286339079, |
| "grad_norm": 0.1730797290802002, |
| "learning_rate": 5.103992006475416e-06, |
| "loss": 0.6951, |
| "mean_token_accuracy": 0.7884382456541061, |
| "num_tokens": 5760209986.0, |
| "step": 2770 |
| }, |
| { |
| "epoch": 2.8551582197067145, |
| "grad_norm": 0.17350102961063385, |
| "learning_rate": 5.097053069548554e-06, |
| "loss": 0.6985, |
| "mean_token_accuracy": 0.7875759869813919, |
| "num_tokens": 5770624277.0, |
| "step": 2775 |
| }, |
| { |
| "epoch": 2.8603035760226394, |
| "grad_norm": 0.16581964492797852, |
| "learning_rate": 5.090352191119167e-06, |
| "loss": 0.6968, |
| "mean_token_accuracy": 0.7881747186183929, |
| "num_tokens": 5780998524.0, |
| "step": 2780 |
| }, |
| { |
| "epoch": 2.8654489323385643, |
| "grad_norm": 0.1817171722650528, |
| "learning_rate": 5.083889586669148e-06, |
| "loss": 0.6957, |
| "mean_token_accuracy": 0.7882327765226365, |
| "num_tokens": 5791408666.0, |
| "step": 2785 |
| }, |
| { |
| "epoch": 2.8705942886544893, |
| "grad_norm": 0.18993426859378815, |
| "learning_rate": 5.077665464018158e-06, |
| "loss": 0.7035, |
| "mean_token_accuracy": 0.7860898345708847, |
| "num_tokens": 5801820214.0, |
| "step": 2790 |
| }, |
| { |
| "epoch": 2.8757396449704142, |
| "grad_norm": 0.21299272775650024, |
| "learning_rate": 5.071680023316934e-06, |
| "loss": 0.688, |
| "mean_token_accuracy": 0.7904294729232788, |
| "num_tokens": 5812235390.0, |
| "step": 2795 |
| }, |
| { |
| "epoch": 2.880885001286339, |
| "grad_norm": 0.17882496118545532, |
| "learning_rate": 5.065933457040855e-06, |
| "loss": 0.6996, |
| "mean_token_accuracy": 0.7872415781021118, |
| "num_tokens": 5822639708.0, |
| "step": 2800 |
| }, |
| { |
| "epoch": 2.886030357602264, |
| "grad_norm": 0.16855992376804352, |
| "learning_rate": 5.060425949983754e-06, |
| "loss": 0.682, |
| "mean_token_accuracy": 0.7919277369976043, |
| "num_tokens": 5833034354.0, |
| "step": 2805 |
| }, |
| { |
| "epoch": 2.891175713918189, |
| "grad_norm": 0.17891888320446014, |
| "learning_rate": 5.055157679251973e-06, |
| "loss": 0.6899, |
| "mean_token_accuracy": 0.7897359609603882, |
| "num_tokens": 5843403693.0, |
| "step": 2810 |
| }, |
| { |
| "epoch": 2.8963210702341136, |
| "grad_norm": 0.16349950432777405, |
| "learning_rate": 5.05012881425867e-06, |
| "loss": 0.6863, |
| "mean_token_accuracy": 0.7908827304840088, |
| "num_tokens": 5853803722.0, |
| "step": 2815 |
| }, |
| { |
| "epoch": 2.9014664265500385, |
| "grad_norm": 0.1745273768901825, |
| "learning_rate": 5.045339516718369e-06, |
| "loss": 0.6893, |
| "mean_token_accuracy": 0.7903022348880768, |
| "num_tokens": 5864184533.0, |
| "step": 2820 |
| }, |
| { |
| "epoch": 2.9066117828659634, |
| "grad_norm": 0.16056472063064575, |
| "learning_rate": 5.0407899406417626e-06, |
| "loss": 0.6952, |
| "mean_token_accuracy": 0.7885043084621429, |
| "num_tokens": 5874589484.0, |
| "step": 2825 |
| }, |
| { |
| "epoch": 2.9117571391818884, |
| "grad_norm": 0.17720390856266022, |
| "learning_rate": 5.036480232330756e-06, |
| "loss": 0.6936, |
| "mean_token_accuracy": 0.7888238668441773, |
| "num_tokens": 5885001020.0, |
| "step": 2830 |
| }, |
| { |
| "epoch": 2.9169024954978133, |
| "grad_norm": 0.16652604937553406, |
| "learning_rate": 5.032410530373764e-06, |
| "loss": 0.7, |
| "mean_token_accuracy": 0.7873083800077438, |
| "num_tokens": 5895408701.0, |
| "step": 2835 |
| }, |
| { |
| "epoch": 2.922047851813738, |
| "grad_norm": 0.17051072418689728, |
| "learning_rate": 5.028580965641256e-06, |
| "loss": 0.6925, |
| "mean_token_accuracy": 0.7890042126178741, |
| "num_tokens": 5905806664.0, |
| "step": 2840 |
| }, |
| { |
| "epoch": 2.9271932081296628, |
| "grad_norm": 0.16726641356945038, |
| "learning_rate": 5.024991661281546e-06, |
| "loss": 0.6962, |
| "mean_token_accuracy": 0.7878126442432404, |
| "num_tokens": 5916184097.0, |
| "step": 2845 |
| }, |
| { |
| "epoch": 2.9323385644455877, |
| "grad_norm": 0.16886568069458008, |
| "learning_rate": 5.0216427327168295e-06, |
| "loss": 0.6861, |
| "mean_token_accuracy": 0.7910468071699143, |
| "num_tokens": 5926604231.0, |
| "step": 2850 |
| }, |
| { |
| "epoch": 2.9374839207615127, |
| "grad_norm": 0.1645469218492508, |
| "learning_rate": 5.0185342876394775e-06, |
| "loss": 0.6954, |
| "mean_token_accuracy": 0.7883421629667282, |
| "num_tokens": 5937011174.0, |
| "step": 2855 |
| }, |
| { |
| "epoch": 2.9426292770774376, |
| "grad_norm": 0.17823590338230133, |
| "learning_rate": 5.0156664260085695e-06, |
| "loss": 0.6896, |
| "mean_token_accuracy": 0.7900129020214081, |
| "num_tokens": 5947409312.0, |
| "step": 2860 |
| }, |
| { |
| "epoch": 2.9477746333933625, |
| "grad_norm": 0.18835224211215973, |
| "learning_rate": 5.0130392400466835e-06, |
| "loss": 0.689, |
| "mean_token_accuracy": 0.7900780767202378, |
| "num_tokens": 5957805326.0, |
| "step": 2865 |
| }, |
| { |
| "epoch": 2.9529199897092875, |
| "grad_norm": 0.1715887039899826, |
| "learning_rate": 5.010652814236921e-06, |
| "loss": 0.6909, |
| "mean_token_accuracy": 0.7899001896381378, |
| "num_tokens": 5968218507.0, |
| "step": 2870 |
| }, |
| { |
| "epoch": 2.9580653460252124, |
| "grad_norm": 0.18797433376312256, |
| "learning_rate": 5.008507225320203e-06, |
| "loss": 0.704, |
| "mean_token_accuracy": 0.7860912084579468, |
| "num_tokens": 5978622283.0, |
| "step": 2875 |
| }, |
| { |
| "epoch": 2.9632107023411374, |
| "grad_norm": 0.17873001098632812, |
| "learning_rate": 5.00660254229279e-06, |
| "loss": 0.6968, |
| "mean_token_accuracy": 0.7881993442773819, |
| "num_tokens": 5988997989.0, |
| "step": 2880 |
| }, |
| { |
| "epoch": 2.968356058657062, |
| "grad_norm": 0.18319903314113617, |
| "learning_rate": 5.004938826404073e-06, |
| "loss": 0.6993, |
| "mean_token_accuracy": 0.787423062324524, |
| "num_tokens": 5999359936.0, |
| "step": 2885 |
| }, |
| { |
| "epoch": 2.973501414972987, |
| "grad_norm": 0.1805352121591568, |
| "learning_rate": 5.003516131154598e-06, |
| "loss": 0.6972, |
| "mean_token_accuracy": 0.7876656591892243, |
| "num_tokens": 6009761882.0, |
| "step": 2890 |
| }, |
| { |
| "epoch": 2.9786467712889118, |
| "grad_norm": 0.18117545545101166, |
| "learning_rate": 5.002334502294346e-06, |
| "loss": 0.6947, |
| "mean_token_accuracy": 0.7882909893989563, |
| "num_tokens": 6020167222.0, |
| "step": 2895 |
| }, |
| { |
| "epoch": 2.9837921276048367, |
| "grad_norm": 0.1723506599664688, |
| "learning_rate": 5.001393977821266e-06, |
| "loss": 0.7042, |
| "mean_token_accuracy": 0.7860449641942978, |
| "num_tokens": 6030555418.0, |
| "step": 2900 |
| }, |
| { |
| "epoch": 2.9889374839207616, |
| "grad_norm": 0.19804443418979645, |
| "learning_rate": 5.00069458798005e-06, |
| "loss": 0.6997, |
| "mean_token_accuracy": 0.7871603965759277, |
| "num_tokens": 6040947002.0, |
| "step": 2905 |
| }, |
| { |
| "epoch": 2.994082840236686, |
| "grad_norm": 0.16171956062316895, |
| "learning_rate": 5.000236355261159e-06, |
| "loss": 0.6994, |
| "mean_token_accuracy": 0.787379264831543, |
| "num_tokens": 6051343009.0, |
| "step": 2910 |
| }, |
| { |
| "epoch": 2.999228196552611, |
| "grad_norm": 0.17762000858783722, |
| "learning_rate": 5.000019294400102e-06, |
| "loss": 0.6875, |
| "mean_token_accuracy": 0.7905127763748169, |
| "num_tokens": 6061722567.0, |
| "step": 2915 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 2916, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 450, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.2789425421603045e+19, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|