diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6139 @@ +{ + "best_global_step": 5300, + "best_metric": 1.168828010559082, + "best_model_checkpoint": "/workspace/project_2026_1/checkpoints/sft/checkpoint-5300", + "epoch": 1.6181229773462782, + "eval_steps": 100, + "global_step": 5500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.1349310517311095, + "epoch": 0.0029420417769932335, + "grad_norm": 5.712224006652832, + "learning_rate": 3.5294117647058825e-06, + "loss": 3.9359054565429688, + "mean_token_accuracy": 0.45907036662101747, + "num_tokens": 40265.0, + "step": 10 + }, + { + "entropy": 1.1904624342918395, + "epoch": 0.005884083553986467, + "grad_norm": 3.6260242462158203, + "learning_rate": 7.450980392156863e-06, + "loss": 3.447366714477539, + "mean_token_accuracy": 0.4740333199501038, + "num_tokens": 80768.0, + "step": 20 + }, + { + "entropy": 1.641614854335785, + "epoch": 0.0088261253309797, + "grad_norm": 1.758035659790039, + "learning_rate": 1.1372549019607843e-05, + "loss": 2.9342424392700197, + "mean_token_accuracy": 0.4756320804357529, + "num_tokens": 121287.0, + "step": 30 + }, + { + "entropy": 2.0289795875549315, + "epoch": 0.011768167107972934, + "grad_norm": 0.8422009348869324, + "learning_rate": 1.5294117647058826e-05, + "loss": 2.4612199783325197, + "mean_token_accuracy": 0.5210394144058228, + "num_tokens": 161772.0, + "step": 40 + }, + { + "entropy": 2.1589465618133543, + "epoch": 0.014710208884966167, + "grad_norm": 0.6909259557723999, + "learning_rate": 1.9215686274509807e-05, + "loss": 2.1433944702148438, + "mean_token_accuracy": 0.5696752369403839, + "num_tokens": 202309.0, + "step": 50 + }, + { + "entropy": 1.932080829143524, + "epoch": 0.0176522506619594, + "grad_norm": 0.7911986112594604, + "learning_rate": 2.3137254901960788e-05, + "loss": 1.8811990737915039, + "mean_token_accuracy": 0.6150015771389008, + "num_tokens": 242490.0, + "step": 60 + }, + { + "entropy": 1.6605080842971802, + "epoch": 0.020594292438952635, + "grad_norm": 0.790881872177124, + "learning_rate": 2.7058823529411766e-05, + "loss": 1.6363672256469726, + "mean_token_accuracy": 0.6599553287029266, + "num_tokens": 282845.0, + "step": 70 + }, + { + "entropy": 1.5992008209228517, + "epoch": 0.023536334215945868, + "grad_norm": 0.4468841850757599, + "learning_rate": 3.098039215686275e-05, + "loss": 1.6219806671142578, + "mean_token_accuracy": 0.6577221751213074, + "num_tokens": 323245.0, + "step": 80 + }, + { + "entropy": 1.5317162871360779, + "epoch": 0.0264783759929391, + "grad_norm": 0.49570420384407043, + "learning_rate": 3.4901960784313725e-05, + "loss": 1.5623438835144043, + "mean_token_accuracy": 0.6620652556419373, + "num_tokens": 363709.0, + "step": 90 + }, + { + "entropy": 1.5458295822143555, + "epoch": 0.029420417769932334, + "grad_norm": 0.4536747634410858, + "learning_rate": 3.882352941176471e-05, + "loss": 1.577082633972168, + "mean_token_accuracy": 0.6642140686511994, + "num_tokens": 404093.0, + "step": 100 + }, + { + "epoch": 0.029420417769932334, + "eval_entropy": 1.519468894467892, + "eval_loss": 1.5212204456329346, + "eval_mean_token_accuracy": 0.6755449191478919, + "eval_num_tokens": 404093.0, + "eval_runtime": 116.8522, + "eval_samples_per_second": 26.059, + "eval_steps_per_second": 3.261, + "step": 100 + }, + { + "entropy": 1.5117838263511658, + "epoch": 0.032362459546925564, + "grad_norm": 0.4160189628601074, + "learning_rate": 4.274509803921569e-05, + "loss": 1.5348424911499023, + "mean_token_accuracy": 0.6751707136631012, + "num_tokens": 444665.0, + "step": 110 + }, + { + "entropy": 1.4816083550453185, + "epoch": 0.0353045013239188, + "grad_norm": 0.4050695300102234, + "learning_rate": 4.666666666666667e-05, + "loss": 1.5027993202209473, + "mean_token_accuracy": 0.6793931305408478, + "num_tokens": 485247.0, + "step": 120 + }, + { + "entropy": 1.5376260161399842, + "epoch": 0.03824654310091203, + "grad_norm": 0.4373331367969513, + "learning_rate": 5.058823529411765e-05, + "loss": 1.5507566452026367, + "mean_token_accuracy": 0.6680718421936035, + "num_tokens": 525640.0, + "step": 130 + }, + { + "entropy": 1.568453598022461, + "epoch": 0.04118858487790527, + "grad_norm": 0.41970527172088623, + "learning_rate": 5.450980392156863e-05, + "loss": 1.5769514083862304, + "mean_token_accuracy": 0.6546712338924408, + "num_tokens": 565941.0, + "step": 140 + }, + { + "entropy": 1.4061901330947877, + "epoch": 0.0441306266548985, + "grad_norm": 0.43462952971458435, + "learning_rate": 5.843137254901961e-05, + "loss": 1.4184735298156739, + "mean_token_accuracy": 0.6973597705364227, + "num_tokens": 606491.0, + "step": 150 + }, + { + "entropy": 1.4754432439804077, + "epoch": 0.047072668431891736, + "grad_norm": 0.42188599705696106, + "learning_rate": 6.23529411764706e-05, + "loss": 1.5123021125793457, + "mean_token_accuracy": 0.6780355155467988, + "num_tokens": 646949.0, + "step": 160 + }, + { + "entropy": 1.4351889610290527, + "epoch": 0.05001471020888497, + "grad_norm": 0.4059581458568573, + "learning_rate": 6.627450980392157e-05, + "loss": 1.4382680892944335, + "mean_token_accuracy": 0.6908938407897949, + "num_tokens": 687299.0, + "step": 170 + }, + { + "entropy": 1.4386033058166503, + "epoch": 0.0529567519858782, + "grad_norm": 0.4081089198589325, + "learning_rate": 7.019607843137255e-05, + "loss": 1.4630658149719238, + "mean_token_accuracy": 0.6837093889713287, + "num_tokens": 727817.0, + "step": 180 + }, + { + "entropy": 1.4710845828056336, + "epoch": 0.055898793762871435, + "grad_norm": 0.4348011612892151, + "learning_rate": 7.411764705882354e-05, + "loss": 1.483832550048828, + "mean_token_accuracy": 0.6794860780239105, + "num_tokens": 767982.0, + "step": 190 + }, + { + "entropy": 1.4718186855316162, + "epoch": 0.05884083553986467, + "grad_norm": 0.3977382779121399, + "learning_rate": 7.803921568627451e-05, + "loss": 1.4843897819519043, + "mean_token_accuracy": 0.6810027420520782, + "num_tokens": 808206.0, + "step": 200 + }, + { + "epoch": 0.05884083553986467, + "eval_entropy": 1.4348028574402876, + "eval_loss": 1.4701354503631592, + "eval_mean_token_accuracy": 0.6813244935408664, + "eval_num_tokens": 808206.0, + "eval_runtime": 116.9082, + "eval_samples_per_second": 26.046, + "eval_steps_per_second": 3.259, + "step": 200 + }, + { + "entropy": 1.4585287928581239, + "epoch": 0.0617828773168579, + "grad_norm": 0.41666489839553833, + "learning_rate": 8.196078431372549e-05, + "loss": 1.4701197624206543, + "mean_token_accuracy": 0.6891506373882293, + "num_tokens": 848584.0, + "step": 210 + }, + { + "entropy": 1.404802179336548, + "epoch": 0.06472491909385113, + "grad_norm": 0.4331190288066864, + "learning_rate": 8.588235294117646e-05, + "loss": 1.4301957130432128, + "mean_token_accuracy": 0.6959148466587066, + "num_tokens": 889067.0, + "step": 220 + }, + { + "entropy": 1.419868004322052, + "epoch": 0.06766696087084437, + "grad_norm": 0.4185863435268402, + "learning_rate": 8.980392156862746e-05, + "loss": 1.4252424240112305, + "mean_token_accuracy": 0.6949863255023956, + "num_tokens": 929411.0, + "step": 230 + }, + { + "entropy": 1.4240824103355407, + "epoch": 0.0706090026478376, + "grad_norm": 0.43419042229652405, + "learning_rate": 9.372549019607843e-05, + "loss": 1.4376652717590332, + "mean_token_accuracy": 0.6870559632778168, + "num_tokens": 970055.0, + "step": 240 + }, + { + "entropy": 1.4307694911956788, + "epoch": 0.07355104442483083, + "grad_norm": 0.3987244963645935, + "learning_rate": 9.764705882352942e-05, + "loss": 1.4383943557739258, + "mean_token_accuracy": 0.685227632522583, + "num_tokens": 1010437.0, + "step": 250 + }, + { + "entropy": 1.50078626871109, + "epoch": 0.07649308620182406, + "grad_norm": 0.3530856668949127, + "learning_rate": 0.00010156862745098039, + "loss": 1.5160396575927735, + "mean_token_accuracy": 0.6708036184310913, + "num_tokens": 1051087.0, + "step": 260 + }, + { + "entropy": 1.4214369535446167, + "epoch": 0.0794351279788173, + "grad_norm": 0.35215267539024353, + "learning_rate": 0.00010549019607843139, + "loss": 1.4516315460205078, + "mean_token_accuracy": 0.6868359208106994, + "num_tokens": 1091565.0, + "step": 270 + }, + { + "entropy": 1.4430809259414672, + "epoch": 0.08237716975581054, + "grad_norm": 0.3903290629386902, + "learning_rate": 0.00010941176470588237, + "loss": 1.4475218772888183, + "mean_token_accuracy": 0.686697655916214, + "num_tokens": 1131001.0, + "step": 280 + }, + { + "entropy": 1.4334508419036864, + "epoch": 0.08531921153280377, + "grad_norm": 0.3959214985370636, + "learning_rate": 0.00011333333333333334, + "loss": 1.4455389976501465, + "mean_token_accuracy": 0.6854739308357238, + "num_tokens": 1171541.0, + "step": 290 + }, + { + "entropy": 1.5002384662628174, + "epoch": 0.088261253309797, + "grad_norm": 0.3723829984664917, + "learning_rate": 0.00011725490196078433, + "loss": 1.5168241500854491, + "mean_token_accuracy": 0.66768679022789, + "num_tokens": 1211498.0, + "step": 300 + }, + { + "epoch": 0.088261253309797, + "eval_entropy": 1.4693221822185467, + "eval_loss": 1.448572039604187, + "eval_mean_token_accuracy": 0.6847088332251301, + "eval_num_tokens": 1211498.0, + "eval_runtime": 116.8305, + "eval_samples_per_second": 26.063, + "eval_steps_per_second": 3.261, + "step": 300 + }, + { + "entropy": 1.3928457140922545, + "epoch": 0.09120329508679023, + "grad_norm": 0.4283956289291382, + "learning_rate": 0.0001211764705882353, + "loss": 1.4159896850585938, + "mean_token_accuracy": 0.6940667152404785, + "num_tokens": 1251562.0, + "step": 310 + }, + { + "entropy": 1.3655309796333313, + "epoch": 0.09414533686378347, + "grad_norm": 0.34452083706855774, + "learning_rate": 0.00012509803921568628, + "loss": 1.3777886390686036, + "mean_token_accuracy": 0.7042169332504272, + "num_tokens": 1292029.0, + "step": 320 + }, + { + "entropy": 1.4647573351860046, + "epoch": 0.0970873786407767, + "grad_norm": 0.3209940493106842, + "learning_rate": 0.00012901960784313728, + "loss": 1.4774354934692382, + "mean_token_accuracy": 0.6791389882564545, + "num_tokens": 1332530.0, + "step": 330 + }, + { + "entropy": 1.4148489713668824, + "epoch": 0.10002942041776994, + "grad_norm": 0.4031716585159302, + "learning_rate": 0.00013294117647058822, + "loss": 1.4249235153198243, + "mean_token_accuracy": 0.692065304517746, + "num_tokens": 1372940.0, + "step": 340 + }, + { + "entropy": 1.3905927181243896, + "epoch": 0.10297146219476316, + "grad_norm": 0.40045231580734253, + "learning_rate": 0.00013686274509803922, + "loss": 1.4047340393066405, + "mean_token_accuracy": 0.6952064216136933, + "num_tokens": 1413313.0, + "step": 350 + }, + { + "entropy": 1.393438732624054, + "epoch": 0.1059135039717564, + "grad_norm": 0.3352883458137512, + "learning_rate": 0.00014078431372549022, + "loss": 1.4050509452819824, + "mean_token_accuracy": 0.6900130629539489, + "num_tokens": 1453217.0, + "step": 360 + }, + { + "entropy": 1.3930254817008971, + "epoch": 0.10885554574874963, + "grad_norm": 0.31767141819000244, + "learning_rate": 0.0001447058823529412, + "loss": 1.3919607162475587, + "mean_token_accuracy": 0.6927550494670868, + "num_tokens": 1493584.0, + "step": 370 + }, + { + "entropy": 1.3573615312576295, + "epoch": 0.11179758752574287, + "grad_norm": 0.32946810126304626, + "learning_rate": 0.00014862745098039216, + "loss": 1.3821157455444335, + "mean_token_accuracy": 0.6986469984054565, + "num_tokens": 1534100.0, + "step": 380 + }, + { + "entropy": 1.4580389261245728, + "epoch": 0.1147396293027361, + "grad_norm": 0.3781375288963318, + "learning_rate": 0.00015254901960784313, + "loss": 1.4741416931152345, + "mean_token_accuracy": 0.6809489071369171, + "num_tokens": 1574623.0, + "step": 390 + }, + { + "entropy": 1.4030352354049682, + "epoch": 0.11768167107972934, + "grad_norm": 0.3613905906677246, + "learning_rate": 0.00015647058823529413, + "loss": 1.4134941101074219, + "mean_token_accuracy": 0.6923583388328552, + "num_tokens": 1615045.0, + "step": 400 + }, + { + "epoch": 0.11768167107972934, + "eval_entropy": 1.3998811366989856, + "eval_loss": 1.432859182357788, + "eval_mean_token_accuracy": 0.6876164205430999, + "eval_num_tokens": 1615045.0, + "eval_runtime": 116.8804, + "eval_samples_per_second": 26.052, + "eval_steps_per_second": 3.26, + "step": 400 + }, + { + "entropy": 1.3861081957817079, + "epoch": 0.12062371285672256, + "grad_norm": 0.32785341143608093, + "learning_rate": 0.0001603921568627451, + "loss": 1.4069743156433105, + "mean_token_accuracy": 0.6965118229389191, + "num_tokens": 1655438.0, + "step": 410 + }, + { + "entropy": 1.3972581624984741, + "epoch": 0.1235657546337158, + "grad_norm": 0.3461436331272125, + "learning_rate": 0.00016431372549019607, + "loss": 1.4036929130554199, + "mean_token_accuracy": 0.6954509198665619, + "num_tokens": 1696005.0, + "step": 420 + }, + { + "entropy": 1.3920337677001953, + "epoch": 0.12650779641070903, + "grad_norm": 0.330691933631897, + "learning_rate": 0.00016823529411764707, + "loss": 1.4209896087646485, + "mean_token_accuracy": 0.6902368903160095, + "num_tokens": 1736450.0, + "step": 430 + }, + { + "entropy": 1.447186005115509, + "epoch": 0.12944983818770225, + "grad_norm": 0.3359420895576477, + "learning_rate": 0.00017215686274509807, + "loss": 1.4491106986999511, + "mean_token_accuracy": 0.6825862407684327, + "num_tokens": 1776731.0, + "step": 440 + }, + { + "entropy": 1.3688698172569276, + "epoch": 0.1323918799646955, + "grad_norm": 0.31416866183280945, + "learning_rate": 0.000176078431372549, + "loss": 1.3883934020996094, + "mean_token_accuracy": 0.6974851131439209, + "num_tokens": 1816994.0, + "step": 450 + }, + { + "entropy": 1.4649083375930787, + "epoch": 0.13533392174168873, + "grad_norm": 0.3449016511440277, + "learning_rate": 0.00018, + "loss": 1.4666101455688476, + "mean_token_accuracy": 0.6721475541591644, + "num_tokens": 1857622.0, + "step": 460 + }, + { + "entropy": 1.394101870059967, + "epoch": 0.13827596351868196, + "grad_norm": 0.32241928577423096, + "learning_rate": 0.00018392156862745098, + "loss": 1.4229127883911132, + "mean_token_accuracy": 0.6893933176994324, + "num_tokens": 1898085.0, + "step": 470 + }, + { + "entropy": 1.4726597905158996, + "epoch": 0.1412180052956752, + "grad_norm": 0.3263033926486969, + "learning_rate": 0.00018784313725490198, + "loss": 1.495950412750244, + "mean_token_accuracy": 0.6736144661903382, + "num_tokens": 1938687.0, + "step": 480 + }, + { + "entropy": 1.4047853350639343, + "epoch": 0.14416004707266844, + "grad_norm": 0.32949718832969666, + "learning_rate": 0.00019176470588235295, + "loss": 1.4045245170593261, + "mean_token_accuracy": 0.6931259393692016, + "num_tokens": 1978716.0, + "step": 490 + }, + { + "entropy": 1.4191704154014588, + "epoch": 0.14710208884966167, + "grad_norm": 0.31867218017578125, + "learning_rate": 0.00019568627450980392, + "loss": 1.4353232383728027, + "mean_token_accuracy": 0.68796147108078, + "num_tokens": 2019187.0, + "step": 500 + }, + { + "epoch": 0.14710208884966167, + "eval_entropy": 1.3842839742582926, + "eval_loss": 1.4221439361572266, + "eval_mean_token_accuracy": 0.6893131504534423, + "eval_num_tokens": 2019187.0, + "eval_runtime": 116.869, + "eval_samples_per_second": 26.055, + "eval_steps_per_second": 3.26, + "step": 500 + }, + { + "entropy": 1.4995567321777343, + "epoch": 0.1500441306266549, + "grad_norm": 0.32378971576690674, + "learning_rate": 0.00019960784313725492, + "loss": 1.5118988037109375, + "mean_token_accuracy": 0.6660271644592285, + "num_tokens": 2059838.0, + "step": 510 + }, + { + "entropy": 1.3857444763183593, + "epoch": 0.15298617240364812, + "grad_norm": 0.32856202125549316, + "learning_rate": 0.00019999957403309267, + "loss": 1.3925944328308106, + "mean_token_accuracy": 0.6954998314380646, + "num_tokens": 2099887.0, + "step": 520 + }, + { + "entropy": 1.444947075843811, + "epoch": 0.15592821418064137, + "grad_norm": 0.34244897961616516, + "learning_rate": 0.00019999810155955347, + "loss": 1.4644223213195802, + "mean_token_accuracy": 0.6775131762027741, + "num_tokens": 2139863.0, + "step": 530 + }, + { + "entropy": 1.4525998115539551, + "epoch": 0.1588702559576346, + "grad_norm": 0.3342621624469757, + "learning_rate": 0.00019999557733601503, + "loss": 1.4777566909790039, + "mean_token_accuracy": 0.6743626773357392, + "num_tokens": 2180117.0, + "step": 540 + }, + { + "entropy": 1.3970038414001464, + "epoch": 0.16181229773462782, + "grad_norm": 0.33053645491600037, + "learning_rate": 0.00019999200138902642, + "loss": 1.4096016883850098, + "mean_token_accuracy": 0.6931852400302887, + "num_tokens": 2219829.0, + "step": 550 + }, + { + "entropy": 1.4066221356391906, + "epoch": 0.16475433951162108, + "grad_norm": 0.33736753463745117, + "learning_rate": 0.00019998737375619835, + "loss": 1.4260540008544922, + "mean_token_accuracy": 0.6908943474292755, + "num_tokens": 2260238.0, + "step": 560 + }, + { + "entropy": 1.3926284670829774, + "epoch": 0.1676963812886143, + "grad_norm": 0.3630967140197754, + "learning_rate": 0.0001999816944862029, + "loss": 1.408204936981201, + "mean_token_accuracy": 0.6962983906269073, + "num_tokens": 2300497.0, + "step": 570 + }, + { + "entropy": 1.3583880305290221, + "epoch": 0.17063842306560753, + "grad_norm": 0.3344590365886688, + "learning_rate": 0.0001999749636387729, + "loss": 1.3612471580505372, + "mean_token_accuracy": 0.7032208800315857, + "num_tokens": 2340896.0, + "step": 580 + }, + { + "entropy": 1.3552134156227111, + "epoch": 0.17358046484260076, + "grad_norm": 0.3601242005825043, + "learning_rate": 0.00019996718128470137, + "loss": 1.378493595123291, + "mean_token_accuracy": 0.6997579276561737, + "num_tokens": 2381277.0, + "step": 590 + }, + { + "entropy": 1.4107160449028016, + "epoch": 0.176522506619594, + "grad_norm": 0.38008254766464233, + "learning_rate": 0.00019995834750584078, + "loss": 1.4177864074707032, + "mean_token_accuracy": 0.6893383502960205, + "num_tokens": 2421617.0, + "step": 600 + }, + { + "epoch": 0.176522506619594, + "eval_entropy": 1.3129558118935332, + "eval_loss": 1.4119162559509277, + "eval_mean_token_accuracy": 0.6910557604524408, + "eval_num_tokens": 2421617.0, + "eval_runtime": 117.0182, + "eval_samples_per_second": 26.022, + "eval_steps_per_second": 3.256, + "step": 600 + }, + { + "entropy": 1.3507792115211488, + "epoch": 0.17946454839658724, + "grad_norm": 0.3318082392215729, + "learning_rate": 0.00019994846239510216, + "loss": 1.3647551536560059, + "mean_token_accuracy": 0.7037387728691101, + "num_tokens": 2462059.0, + "step": 610 + }, + { + "entropy": 1.395737397670746, + "epoch": 0.18240659017358046, + "grad_norm": 0.34794482588768005, + "learning_rate": 0.00019993752605645417, + "loss": 1.4275907516479491, + "mean_token_accuracy": 0.6850803017616272, + "num_tokens": 2502114.0, + "step": 620 + }, + { + "entropy": 1.4139577507972718, + "epoch": 0.1853486319505737, + "grad_norm": 0.3424963653087616, + "learning_rate": 0.00019992553860492191, + "loss": 1.4030399322509766, + "mean_token_accuracy": 0.6936035215854645, + "num_tokens": 2542391.0, + "step": 630 + }, + { + "entropy": 1.3600899696350097, + "epoch": 0.18829067372756694, + "grad_norm": 0.4584237039089203, + "learning_rate": 0.00019991250016658578, + "loss": 1.3970141410827637, + "mean_token_accuracy": 0.6953531980514527, + "num_tokens": 2582892.0, + "step": 640 + }, + { + "entropy": 1.4006186485290528, + "epoch": 0.19123271550456017, + "grad_norm": 0.3958089053630829, + "learning_rate": 0.00019989841087858019, + "loss": 1.4342127799987794, + "mean_token_accuracy": 0.6836777746677398, + "num_tokens": 2623322.0, + "step": 650 + }, + { + "entropy": 1.4262026309967042, + "epoch": 0.1941747572815534, + "grad_norm": 0.38685840368270874, + "learning_rate": 0.000199883270889092, + "loss": 1.4162178993225099, + "mean_token_accuracy": 0.6810121357440948, + "num_tokens": 2664008.0, + "step": 660 + }, + { + "entropy": 1.3658849358558656, + "epoch": 0.19711679905854662, + "grad_norm": 0.37710094451904297, + "learning_rate": 0.00019986708035735914, + "loss": 1.390056037902832, + "mean_token_accuracy": 0.6923020124435425, + "num_tokens": 2704594.0, + "step": 670 + }, + { + "entropy": 1.363602066040039, + "epoch": 0.20005884083553988, + "grad_norm": 0.32848283648490906, + "learning_rate": 0.0001998498394536687, + "loss": 1.4025785446166992, + "mean_token_accuracy": 0.6898211300373077, + "num_tokens": 2744928.0, + "step": 680 + }, + { + "entropy": 1.2932772994041444, + "epoch": 0.2030008826125331, + "grad_norm": 0.375255286693573, + "learning_rate": 0.00019983154835935535, + "loss": 1.271113681793213, + "mean_token_accuracy": 0.72461918592453, + "num_tokens": 2785405.0, + "step": 690 + }, + { + "entropy": 1.31307555437088, + "epoch": 0.20594292438952633, + "grad_norm": 0.3716314733028412, + "learning_rate": 0.0001998122072667993, + "loss": 1.3366676330566407, + "mean_token_accuracy": 0.7082294344902038, + "num_tokens": 2825853.0, + "step": 700 + }, + { + "epoch": 0.20594292438952633, + "eval_entropy": 1.43291619208854, + "eval_loss": 1.3930107355117798, + "eval_mean_token_accuracy": 0.6941823344531022, + "eval_num_tokens": 2825853.0, + "eval_runtime": 117.0332, + "eval_samples_per_second": 26.018, + "eval_steps_per_second": 3.255, + "step": 700 + }, + { + "entropy": 1.3790567636489868, + "epoch": 0.20888496616651955, + "grad_norm": 0.3361744284629822, + "learning_rate": 0.0001997918163794244, + "loss": 1.4195181846618652, + "mean_token_accuracy": 0.6918542444705963, + "num_tokens": 2866199.0, + "step": 710 + }, + { + "entropy": 1.4847285509109498, + "epoch": 0.2118270079435128, + "grad_norm": 0.5025491118431091, + "learning_rate": 0.00019977037591169583, + "loss": 1.479151153564453, + "mean_token_accuracy": 0.6787452459335327, + "num_tokens": 2906632.0, + "step": 720 + }, + { + "entropy": 1.3509209036827088, + "epoch": 0.21476904972050603, + "grad_norm": 0.3572346866130829, + "learning_rate": 0.00019974788608911802, + "loss": 1.3688506126403808, + "mean_token_accuracy": 0.6984890639781952, + "num_tokens": 2947203.0, + "step": 730 + }, + { + "entropy": 1.3631734371185302, + "epoch": 0.21771109149749926, + "grad_norm": 0.33028289675712585, + "learning_rate": 0.00019972434714823217, + "loss": 1.389684009552002, + "mean_token_accuracy": 0.7006505787372589, + "num_tokens": 2987399.0, + "step": 740 + }, + { + "entropy": 1.3604696154594422, + "epoch": 0.22065313327449249, + "grad_norm": 0.368522584438324, + "learning_rate": 0.00019969975933661378, + "loss": 1.376063919067383, + "mean_token_accuracy": 0.6953622698783875, + "num_tokens": 3027379.0, + "step": 750 + }, + { + "entropy": 1.4122770190238954, + "epoch": 0.22359517505148574, + "grad_norm": 0.3348753750324249, + "learning_rate": 0.00019967412291287007, + "loss": 1.406267261505127, + "mean_token_accuracy": 0.6962361812591553, + "num_tokens": 3067608.0, + "step": 760 + }, + { + "entropy": 1.357818615436554, + "epoch": 0.22653721682847897, + "grad_norm": 0.34611567854881287, + "learning_rate": 0.00019964743814663725, + "loss": 1.3942573547363282, + "mean_token_accuracy": 0.6927765250205994, + "num_tokens": 3108050.0, + "step": 770 + }, + { + "entropy": 1.3579653978347779, + "epoch": 0.2294792586054722, + "grad_norm": 0.42795485258102417, + "learning_rate": 0.0001996197053185777, + "loss": 1.369248104095459, + "mean_token_accuracy": 0.70080486536026, + "num_tokens": 3148448.0, + "step": 780 + }, + { + "entropy": 1.3725898623466493, + "epoch": 0.23242130038246542, + "grad_norm": 0.37439125776290894, + "learning_rate": 0.00019959092472037688, + "loss": 1.3844552040100098, + "mean_token_accuracy": 0.6901408195495605, + "num_tokens": 3189104.0, + "step": 790 + }, + { + "entropy": 1.3633452892303466, + "epoch": 0.23536334215945867, + "grad_norm": 0.3426375091075897, + "learning_rate": 0.0001995610966547406, + "loss": 1.3828603744506835, + "mean_token_accuracy": 0.699894517660141, + "num_tokens": 3229594.0, + "step": 800 + }, + { + "epoch": 0.23536334215945867, + "eval_entropy": 1.3796262137220288, + "eval_loss": 1.3804020881652832, + "eval_mean_token_accuracy": 0.6970919832157025, + "eval_num_tokens": 3229594.0, + "eval_runtime": 117.0459, + "eval_samples_per_second": 26.015, + "eval_steps_per_second": 3.255, + "step": 800 + }, + { + "entropy": 1.3366303324699402, + "epoch": 0.2383053839364519, + "grad_norm": 0.31953537464141846, + "learning_rate": 0.00019953022143539144, + "loss": 1.3653806686401366, + "mean_token_accuracy": 0.7049518287181854, + "num_tokens": 3269982.0, + "step": 810 + }, + { + "entropy": 1.3933040380477906, + "epoch": 0.24124742571344512, + "grad_norm": 0.39059045910835266, + "learning_rate": 0.00019949829938706567, + "loss": 1.4077239990234376, + "mean_token_accuracy": 0.6878176212310791, + "num_tokens": 3310501.0, + "step": 820 + }, + { + "entropy": 1.4102402210235596, + "epoch": 0.24418946749043838, + "grad_norm": 0.40408599376678467, + "learning_rate": 0.00019946533084550983, + "loss": 1.42384614944458, + "mean_token_accuracy": 0.6878543615341186, + "num_tokens": 3350905.0, + "step": 830 + }, + { + "entropy": 1.3664647340774536, + "epoch": 0.2471315092674316, + "grad_norm": 0.33828359842300415, + "learning_rate": 0.00019943131615747715, + "loss": 1.3716326713562013, + "mean_token_accuracy": 0.7006340861320496, + "num_tokens": 3391405.0, + "step": 840 + }, + { + "entropy": 1.427152693271637, + "epoch": 0.25007355104442486, + "grad_norm": 0.48738211393356323, + "learning_rate": 0.00019939625568072387, + "loss": 1.4487317085266114, + "mean_token_accuracy": 0.6779101848602295, + "num_tokens": 3431764.0, + "step": 850 + }, + { + "entropy": 1.3256952285766601, + "epoch": 0.25301559282141806, + "grad_norm": 0.4037957191467285, + "learning_rate": 0.00019936014978400558, + "loss": 1.341677474975586, + "mean_token_accuracy": 0.7102087676525116, + "num_tokens": 3472134.0, + "step": 860 + }, + { + "entropy": 1.3219331979751587, + "epoch": 0.2559576345984113, + "grad_norm": 0.3991295099258423, + "learning_rate": 0.00019932299884707324, + "loss": 1.3323281288146973, + "mean_token_accuracy": 0.7040995419025421, + "num_tokens": 3512618.0, + "step": 870 + }, + { + "entropy": 1.327842366695404, + "epoch": 0.2588996763754045, + "grad_norm": 0.37586092948913574, + "learning_rate": 0.00019928480326066925, + "loss": 1.3401626586914062, + "mean_token_accuracy": 0.7077195107936859, + "num_tokens": 3553039.0, + "step": 880 + }, + { + "entropy": 1.4176750898361206, + "epoch": 0.26184171815239776, + "grad_norm": 0.3479557931423187, + "learning_rate": 0.00019924556342652334, + "loss": 1.435785961151123, + "mean_token_accuracy": 0.6839641869068146, + "num_tokens": 3593551.0, + "step": 890 + }, + { + "entropy": 1.3467723608016968, + "epoch": 0.264783759929391, + "grad_norm": 0.3519476056098938, + "learning_rate": 0.00019920527975734827, + "loss": 1.3625640869140625, + "mean_token_accuracy": 0.6986214816570282, + "num_tokens": 3633984.0, + "step": 900 + }, + { + "epoch": 0.264783759929391, + "eval_entropy": 1.327493928861743, + "eval_loss": 1.3682215213775635, + "eval_mean_token_accuracy": 0.6995677048452883, + "eval_num_tokens": 3633984.0, + "eval_runtime": 117.0553, + "eval_samples_per_second": 26.013, + "eval_steps_per_second": 3.255, + "step": 900 + }, + { + "entropy": 1.377519929409027, + "epoch": 0.2677258017063842, + "grad_norm": 0.3842776417732239, + "learning_rate": 0.0001991639526768356, + "loss": 1.4039417266845704, + "mean_token_accuracy": 0.6875424087047577, + "num_tokens": 3674545.0, + "step": 910 + }, + { + "entropy": 1.314482867717743, + "epoch": 0.27066784348337747, + "grad_norm": 0.4236571192741394, + "learning_rate": 0.00019912158261965114, + "loss": 1.3185301780700684, + "mean_token_accuracy": 0.7092587351799011, + "num_tokens": 3714993.0, + "step": 920 + }, + { + "entropy": 1.3224342346191407, + "epoch": 0.2736098852603707, + "grad_norm": 0.3825512230396271, + "learning_rate": 0.0001990781700314304, + "loss": 1.3342713356018066, + "mean_token_accuracy": 0.7061914503574371, + "num_tokens": 3755443.0, + "step": 930 + }, + { + "entropy": 1.3411121726036073, + "epoch": 0.2765519270373639, + "grad_norm": 0.3913728594779968, + "learning_rate": 0.00019903371536877397, + "loss": 1.372488784790039, + "mean_token_accuracy": 0.6965407311916352, + "num_tokens": 3795944.0, + "step": 940 + }, + { + "entropy": 1.3303414344787599, + "epoch": 0.2794939688143572, + "grad_norm": 0.34784528613090515, + "learning_rate": 0.00019898821909924267, + "loss": 1.329643440246582, + "mean_token_accuracy": 0.7043894708156586, + "num_tokens": 3836572.0, + "step": 950 + }, + { + "entropy": 1.3415798902511598, + "epoch": 0.2824360105913504, + "grad_norm": 0.3249607980251312, + "learning_rate": 0.00019894168170135252, + "loss": 1.3672316551208497, + "mean_token_accuracy": 0.7032464861869812, + "num_tokens": 3876943.0, + "step": 960 + }, + { + "entropy": 1.3220824122428894, + "epoch": 0.2853780523683436, + "grad_norm": 0.3845159411430359, + "learning_rate": 0.00019889410366456995, + "loss": 1.326685905456543, + "mean_token_accuracy": 0.7063803553581238, + "num_tokens": 3917514.0, + "step": 970 + }, + { + "entropy": 1.3524356603622436, + "epoch": 0.2883200941453369, + "grad_norm": 0.49597394466400146, + "learning_rate": 0.00019884548548930648, + "loss": 1.3823152542114259, + "mean_token_accuracy": 0.688988733291626, + "num_tokens": 3958016.0, + "step": 980 + }, + { + "entropy": 1.3366973161697389, + "epoch": 0.2912621359223301, + "grad_norm": 0.3381548523902893, + "learning_rate": 0.00019879582768691343, + "loss": 1.3448416709899902, + "mean_token_accuracy": 0.7067325949668884, + "num_tokens": 3998636.0, + "step": 990 + }, + { + "entropy": 1.342777681350708, + "epoch": 0.29420417769932333, + "grad_norm": 0.3816724717617035, + "learning_rate": 0.0001987451307796767, + "loss": 1.3505562782287597, + "mean_token_accuracy": 0.7011382758617402, + "num_tokens": 4039116.0, + "step": 1000 + }, + { + "epoch": 0.29420417769932333, + "eval_entropy": 1.3210531396502898, + "eval_loss": 1.3546180725097656, + "eval_mean_token_accuracy": 0.7016779092040275, + "eval_num_tokens": 4039116.0, + "eval_runtime": 116.9387, + "eval_samples_per_second": 26.039, + "eval_steps_per_second": 3.258, + "step": 1000 + }, + { + "entropy": 1.366915476322174, + "epoch": 0.2971462194763166, + "grad_norm": 0.35182058811187744, + "learning_rate": 0.00019869339530081105, + "loss": 1.3859369277954101, + "mean_token_accuracy": 0.6906876146793366, + "num_tokens": 4079633.0, + "step": 1010 + }, + { + "entropy": 1.3304717183113097, + "epoch": 0.3000882612533098, + "grad_norm": 0.3552508056163788, + "learning_rate": 0.0001986406217944548, + "loss": 1.3530879974365235, + "mean_token_accuracy": 0.703288596868515, + "num_tokens": 4120199.0, + "step": 1020 + }, + { + "entropy": 1.3517386436462402, + "epoch": 0.30303030303030304, + "grad_norm": 0.3985072076320648, + "learning_rate": 0.0001985868108156638, + "loss": 1.3472537040710448, + "mean_token_accuracy": 0.7062719881534576, + "num_tokens": 4160213.0, + "step": 1030 + }, + { + "entropy": 1.2475824117660523, + "epoch": 0.30597234480729624, + "grad_norm": 0.35150817036628723, + "learning_rate": 0.00019853196293040577, + "loss": 1.2721343040466309, + "mean_token_accuracy": 0.7261709928512573, + "num_tokens": 4200515.0, + "step": 1040 + }, + { + "entropy": 1.3748682379722594, + "epoch": 0.3089143865842895, + "grad_norm": 0.3976305425167084, + "learning_rate": 0.00019847607871555426, + "loss": 1.3997004508972168, + "mean_token_accuracy": 0.6935756504535675, + "num_tokens": 4240452.0, + "step": 1050 + }, + { + "entropy": 1.250860321521759, + "epoch": 0.31185642836128274, + "grad_norm": 0.4553461968898773, + "learning_rate": 0.00019841915875888272, + "loss": 1.2498172760009765, + "mean_token_accuracy": 0.7244091928005219, + "num_tokens": 4280962.0, + "step": 1060 + }, + { + "entropy": 1.3760404348373414, + "epoch": 0.31479847013827594, + "grad_norm": 0.489533394575119, + "learning_rate": 0.00019836120365905813, + "loss": 1.3888651847839355, + "mean_token_accuracy": 0.693582957983017, + "num_tokens": 4321434.0, + "step": 1070 + }, + { + "entropy": 1.3051153898239136, + "epoch": 0.3177405119152692, + "grad_norm": 0.3459921181201935, + "learning_rate": 0.0001983022140256348, + "loss": 1.3199047088623046, + "mean_token_accuracy": 0.7050609290599823, + "num_tokens": 4361714.0, + "step": 1080 + }, + { + "entropy": 1.2985354661941528, + "epoch": 0.32068255369226245, + "grad_norm": 0.33598437905311584, + "learning_rate": 0.00019824219047904804, + "loss": 1.3020724296569823, + "mean_token_accuracy": 0.713257223367691, + "num_tokens": 4401949.0, + "step": 1090 + }, + { + "entropy": 1.276042139530182, + "epoch": 0.32362459546925565, + "grad_norm": 0.43967100977897644, + "learning_rate": 0.00019818113365060742, + "loss": 1.3166107177734374, + "mean_token_accuracy": 0.7159667015075684, + "num_tokens": 4442174.0, + "step": 1100 + }, + { + "epoch": 0.32362459546925565, + "eval_entropy": 1.3324664305514238, + "eval_loss": 1.3422644138336182, + "eval_mean_token_accuracy": 0.7045243512301307, + "eval_num_tokens": 4442174.0, + "eval_runtime": 116.854, + "eval_samples_per_second": 26.058, + "eval_steps_per_second": 3.26, + "step": 1100 + }, + { + "entropy": 1.3644051671028137, + "epoch": 0.3265666372462489, + "grad_norm": 0.3831937313079834, + "learning_rate": 0.0001981190441824903, + "loss": 1.3389662742614745, + "mean_token_accuracy": 0.7077198505401612, + "num_tokens": 4482638.0, + "step": 1110 + }, + { + "entropy": 1.3949143767356873, + "epoch": 0.32950867902324216, + "grad_norm": 0.4152087867259979, + "learning_rate": 0.0001980559227277352, + "loss": 1.4393955230712892, + "mean_token_accuracy": 0.6806177437305451, + "num_tokens": 4523231.0, + "step": 1120 + }, + { + "entropy": 1.5005442142486571, + "epoch": 0.33245072080023536, + "grad_norm": 0.3600977957248688, + "learning_rate": 0.00019799176995023446, + "loss": 1.5087374687194823, + "mean_token_accuracy": 0.6630822360515595, + "num_tokens": 4563875.0, + "step": 1130 + }, + { + "entropy": 1.3872592091560363, + "epoch": 0.3353927625772286, + "grad_norm": 0.41946983337402344, + "learning_rate": 0.00019792658652472784, + "loss": 1.3950308799743651, + "mean_token_accuracy": 0.6971809685230255, + "num_tokens": 4604493.0, + "step": 1140 + }, + { + "entropy": 1.2849906086921692, + "epoch": 0.3383348043542218, + "grad_norm": 0.44172239303588867, + "learning_rate": 0.00019786037313679496, + "loss": 1.3040260314941405, + "mean_token_accuracy": 0.7124627947807312, + "num_tokens": 4644971.0, + "step": 1150 + }, + { + "entropy": 1.3872546672821044, + "epoch": 0.34127684613121506, + "grad_norm": 0.42291128635406494, + "learning_rate": 0.0001977931304828484, + "loss": 1.3932037353515625, + "mean_token_accuracy": 0.6873931646347046, + "num_tokens": 4685402.0, + "step": 1160 + }, + { + "entropy": 1.3018505930900575, + "epoch": 0.3442188879082083, + "grad_norm": 0.40709924697875977, + "learning_rate": 0.00019772485927012617, + "loss": 1.3295734405517579, + "mean_token_accuracy": 0.708010071516037, + "num_tokens": 4725827.0, + "step": 1170 + }, + { + "entropy": 1.306389570236206, + "epoch": 0.3471609296852015, + "grad_norm": 0.3591320514678955, + "learning_rate": 0.00019765556021668438, + "loss": 1.2882349014282226, + "mean_token_accuracy": 0.7196535110473633, + "num_tokens": 4766299.0, + "step": 1180 + }, + { + "entropy": 1.274339497089386, + "epoch": 0.35010297146219477, + "grad_norm": 0.49734798073768616, + "learning_rate": 0.0001975852340513897, + "loss": 1.322108268737793, + "mean_token_accuracy": 0.7085251092910767, + "num_tokens": 4806716.0, + "step": 1190 + }, + { + "entropy": 1.3069317936897278, + "epoch": 0.353045013239188, + "grad_norm": 0.38912537693977356, + "learning_rate": 0.00019751388151391153, + "loss": 1.2966851234436034, + "mean_token_accuracy": 0.7151984930038452, + "num_tokens": 4846722.0, + "step": 1200 + }, + { + "epoch": 0.353045013239188, + "eval_entropy": 1.2881334237852122, + "eval_loss": 1.3329390287399292, + "eval_mean_token_accuracy": 0.7064482895095204, + "eval_num_tokens": 4846722.0, + "eval_runtime": 116.8679, + "eval_samples_per_second": 26.055, + "eval_steps_per_second": 3.26, + "step": 1200 + }, + { + "entropy": 1.323479461669922, + "epoch": 0.3559870550161812, + "grad_norm": 0.3935072720050812, + "learning_rate": 0.0001974415033547145, + "loss": 1.3559471130371095, + "mean_token_accuracy": 0.6980177283287048, + "num_tokens": 4887382.0, + "step": 1210 + }, + { + "entropy": 1.3160091280937194, + "epoch": 0.3589290967931745, + "grad_norm": 0.402215838432312, + "learning_rate": 0.00019736810033505037, + "loss": 1.3169782638549805, + "mean_token_accuracy": 0.7115081787109375, + "num_tokens": 4927725.0, + "step": 1220 + }, + { + "entropy": 1.3298014521598815, + "epoch": 0.36187113857016767, + "grad_norm": 0.4016437530517578, + "learning_rate": 0.00019729367322695, + "loss": 1.340937900543213, + "mean_token_accuracy": 0.7020221769809722, + "num_tokens": 4968297.0, + "step": 1230 + }, + { + "entropy": 1.3313292741775513, + "epoch": 0.3648131803471609, + "grad_norm": 0.4023797810077667, + "learning_rate": 0.00019721822281321537, + "loss": 1.3570178031921387, + "mean_token_accuracy": 0.695677000284195, + "num_tokens": 5008063.0, + "step": 1240 + }, + { + "entropy": 1.3236193537712098, + "epoch": 0.3677552221241542, + "grad_norm": 0.39722758531570435, + "learning_rate": 0.00019714174988741127, + "loss": 1.3275541305541991, + "mean_token_accuracy": 0.7125855982303619, + "num_tokens": 5048336.0, + "step": 1250 + }, + { + "entropy": 1.3190148949623108, + "epoch": 0.3706972639011474, + "grad_norm": 0.35919976234436035, + "learning_rate": 0.000197064255253857, + "loss": 1.3260068893432617, + "mean_token_accuracy": 0.7106278300285339, + "num_tokens": 5088942.0, + "step": 1260 + }, + { + "entropy": 1.3633437037467957, + "epoch": 0.37363930567814063, + "grad_norm": 0.41243675351142883, + "learning_rate": 0.0001969857397276178, + "loss": 1.3687942504882813, + "mean_token_accuracy": 0.6920778334140778, + "num_tokens": 5129372.0, + "step": 1270 + }, + { + "entropy": 1.2383417725563048, + "epoch": 0.3765813474551339, + "grad_norm": 0.40572160482406616, + "learning_rate": 0.00019690620413449642, + "loss": 1.2481071472167968, + "mean_token_accuracy": 0.7250577747821808, + "num_tokens": 5169845.0, + "step": 1280 + }, + { + "entropy": 1.2865507364273072, + "epoch": 0.3795233892321271, + "grad_norm": 0.42980971932411194, + "learning_rate": 0.00019682564931102435, + "loss": 1.3222503662109375, + "mean_token_accuracy": 0.7078245043754577, + "num_tokens": 5210494.0, + "step": 1290 + }, + { + "entropy": 1.3427109718322754, + "epoch": 0.38246543100912034, + "grad_norm": 0.39987629652023315, + "learning_rate": 0.000196744076104453, + "loss": 1.3458648681640626, + "mean_token_accuracy": 0.7039949476718903, + "num_tokens": 5250876.0, + "step": 1300 + }, + { + "epoch": 0.38246543100912034, + "eval_entropy": 1.3054086448639397, + "eval_loss": 1.328322172164917, + "eval_mean_token_accuracy": 0.7073736231470984, + "eval_num_tokens": 5250876.0, + "eval_runtime": 116.8194, + "eval_samples_per_second": 26.066, + "eval_steps_per_second": 3.261, + "step": 1300 + }, + { + "entropy": 1.3619997262954713, + "epoch": 0.38540747278611354, + "grad_norm": 0.39274510741233826, + "learning_rate": 0.00019666148537274486, + "loss": 1.393809986114502, + "mean_token_accuracy": 0.6958752512931824, + "num_tokens": 5291408.0, + "step": 1310 + }, + { + "entropy": 1.313580584526062, + "epoch": 0.3883495145631068, + "grad_norm": 0.4472017288208008, + "learning_rate": 0.00019657787798456447, + "loss": 1.3207698822021485, + "mean_token_accuracy": 0.7056093811988831, + "num_tokens": 5331902.0, + "step": 1320 + }, + { + "entropy": 1.3660961747169496, + "epoch": 0.39129155634010004, + "grad_norm": 0.3704398572444916, + "learning_rate": 0.00019649325481926918, + "loss": 1.3579423904418946, + "mean_token_accuracy": 0.6998582005500793, + "num_tokens": 5372344.0, + "step": 1330 + }, + { + "entropy": 1.2838171362876891, + "epoch": 0.39423359811709324, + "grad_norm": 0.42561763525009155, + "learning_rate": 0.0001964076167669001, + "loss": 1.3317262649536132, + "mean_token_accuracy": 0.707372397184372, + "num_tokens": 5412588.0, + "step": 1340 + }, + { + "entropy": 1.3983584880828857, + "epoch": 0.3971756398940865, + "grad_norm": 0.39428314566612244, + "learning_rate": 0.00019632096472817247, + "loss": 1.3956350326538085, + "mean_token_accuracy": 0.6877002000808716, + "num_tokens": 5452540.0, + "step": 1350 + }, + { + "entropy": 1.2444414377212525, + "epoch": 0.40011768167107975, + "grad_norm": 0.46192431449890137, + "learning_rate": 0.00019623329961446646, + "loss": 1.2581539154052734, + "mean_token_accuracy": 0.7236546576023102, + "num_tokens": 5492713.0, + "step": 1360 + }, + { + "entropy": 1.3872360110282898, + "epoch": 0.40305972344807295, + "grad_norm": 0.4279949367046356, + "learning_rate": 0.0001961446223478174, + "loss": 1.3998414993286132, + "mean_token_accuracy": 0.6933107733726501, + "num_tokens": 5532960.0, + "step": 1370 + }, + { + "entropy": 1.3212559223175049, + "epoch": 0.4060017652250662, + "grad_norm": 0.5924211740493774, + "learning_rate": 0.0001960549338609061, + "loss": 1.3529240608215332, + "mean_token_accuracy": 0.7018321037292481, + "num_tokens": 5573284.0, + "step": 1380 + }, + { + "entropy": 1.2787784337997437, + "epoch": 0.40894380700205946, + "grad_norm": 0.4120098948478699, + "learning_rate": 0.00019596423509704916, + "loss": 1.2666643142700196, + "mean_token_accuracy": 0.7207745730876922, + "num_tokens": 5613401.0, + "step": 1390 + }, + { + "entropy": 1.2354493618011475, + "epoch": 0.41188584877905265, + "grad_norm": 0.3948381245136261, + "learning_rate": 0.00019587252701018897, + "loss": 1.258436393737793, + "mean_token_accuracy": 0.7286782383918762, + "num_tokens": 5653527.0, + "step": 1400 + }, + { + "epoch": 0.41188584877905265, + "eval_entropy": 1.3250519281297217, + "eval_loss": 1.317603588104248, + "eval_mean_token_accuracy": 0.709270513902499, + "eval_num_tokens": 5653527.0, + "eval_runtime": 117.0125, + "eval_samples_per_second": 26.023, + "eval_steps_per_second": 3.256, + "step": 1400 + }, + { + "entropy": 1.2924874544143676, + "epoch": 0.4148278905560459, + "grad_norm": 0.3901238739490509, + "learning_rate": 0.0001957798105648836, + "loss": 1.3112905502319336, + "mean_token_accuracy": 0.7130284488201142, + "num_tokens": 5693830.0, + "step": 1410 + }, + { + "entropy": 1.3326545000076293, + "epoch": 0.4177699323330391, + "grad_norm": 0.4452548623085022, + "learning_rate": 0.0001956860867362968, + "loss": 1.3477538108825684, + "mean_token_accuracy": 0.6966541707515717, + "num_tokens": 5734423.0, + "step": 1420 + }, + { + "entropy": 1.3726879000663756, + "epoch": 0.42071197411003236, + "grad_norm": 0.41140350699424744, + "learning_rate": 0.00019559135651018764, + "loss": 1.364595603942871, + "mean_token_accuracy": 0.6968912184238434, + "num_tokens": 5774991.0, + "step": 1430 + }, + { + "entropy": 1.2218275964260101, + "epoch": 0.4236540158870256, + "grad_norm": 0.34981605410575867, + "learning_rate": 0.0001954956208829002, + "loss": 1.255105686187744, + "mean_token_accuracy": 0.7247668862342834, + "num_tokens": 5815447.0, + "step": 1440 + }, + { + "entropy": 1.277131152153015, + "epoch": 0.4265960576640188, + "grad_norm": 0.4556473195552826, + "learning_rate": 0.00019539888086135302, + "loss": 1.2920147895812988, + "mean_token_accuracy": 0.7172133564949036, + "num_tokens": 5855958.0, + "step": 1450 + }, + { + "entropy": 1.2765482187271118, + "epoch": 0.42953809944101207, + "grad_norm": 0.3995361924171448, + "learning_rate": 0.00019530113746302864, + "loss": 1.2754140853881837, + "mean_token_accuracy": 0.7180281519889832, + "num_tokens": 5896270.0, + "step": 1460 + }, + { + "entropy": 1.3740562319755554, + "epoch": 0.4324801412180053, + "grad_norm": 0.42559972405433655, + "learning_rate": 0.00019520239171596276, + "loss": 1.4036828994750976, + "mean_token_accuracy": 0.6909762322902679, + "num_tokens": 5936780.0, + "step": 1470 + }, + { + "entropy": 1.3417500495910644, + "epoch": 0.4354221829949985, + "grad_norm": 0.36076462268829346, + "learning_rate": 0.00019510264465873344, + "loss": 1.3366243362426757, + "mean_token_accuracy": 0.7046165943145752, + "num_tokens": 5977191.0, + "step": 1480 + }, + { + "entropy": 1.2735178232192994, + "epoch": 0.4383642247719918, + "grad_norm": 0.4245460033416748, + "learning_rate": 0.0001950018973404503, + "loss": 1.296212387084961, + "mean_token_accuracy": 0.7112137854099274, + "num_tokens": 6017786.0, + "step": 1490 + }, + { + "entropy": 1.2875243425369263, + "epoch": 0.44130626654898497, + "grad_norm": 0.39781293272972107, + "learning_rate": 0.00019490015082074342, + "loss": 1.2946128845214844, + "mean_token_accuracy": 0.7150110900402069, + "num_tokens": 6058171.0, + "step": 1500 + }, + { + "epoch": 0.44130626654898497, + "eval_entropy": 1.2906748921539526, + "eval_loss": 1.3084396123886108, + "eval_mean_token_accuracy": 0.711328562789076, + "eval_num_tokens": 6058171.0, + "eval_runtime": 116.9727, + "eval_samples_per_second": 26.032, + "eval_steps_per_second": 3.257, + "step": 1500 + }, + { + "entropy": 1.335238778591156, + "epoch": 0.4442483083259782, + "grad_norm": 0.46100911498069763, + "learning_rate": 0.00019479740616975207, + "loss": 1.3694096565246583, + "mean_token_accuracy": 0.6983109295368195, + "num_tokens": 6098236.0, + "step": 1510 + }, + { + "entropy": 1.3258445739746094, + "epoch": 0.4471903501029715, + "grad_norm": 0.4548512399196625, + "learning_rate": 0.00019469366446811368, + "loss": 1.3223464012145996, + "mean_token_accuracy": 0.7081819117069245, + "num_tokens": 6138323.0, + "step": 1520 + }, + { + "entropy": 1.4055490851402284, + "epoch": 0.4501323918799647, + "grad_norm": 0.5857915282249451, + "learning_rate": 0.0001945889268069523, + "loss": 1.4265625, + "mean_token_accuracy": 0.6870070040225983, + "num_tokens": 6178759.0, + "step": 1530 + }, + { + "entropy": 1.3535587549209596, + "epoch": 0.45307443365695793, + "grad_norm": 0.4886157810688019, + "learning_rate": 0.00019448319428786714, + "loss": 1.3616491317749024, + "mean_token_accuracy": 0.6958298921585083, + "num_tokens": 6219216.0, + "step": 1540 + }, + { + "entropy": 1.2599135279655456, + "epoch": 0.4560164754339512, + "grad_norm": 0.4452349543571472, + "learning_rate": 0.00019437646802292116, + "loss": 1.2533982276916504, + "mean_token_accuracy": 0.7242700159549713, + "num_tokens": 6259618.0, + "step": 1550 + }, + { + "entropy": 1.247275459766388, + "epoch": 0.4589585172109444, + "grad_norm": 0.43969377875328064, + "learning_rate": 0.0001942687491346291, + "loss": 1.3022977828979492, + "mean_token_accuracy": 0.7166135847568512, + "num_tokens": 6300125.0, + "step": 1560 + }, + { + "entropy": 1.2933772325515747, + "epoch": 0.46190055898793764, + "grad_norm": 0.4098397195339203, + "learning_rate": 0.0001941600387559459, + "loss": 1.2829959869384766, + "mean_token_accuracy": 0.7151767909526825, + "num_tokens": 6340600.0, + "step": 1570 + }, + { + "entropy": 1.3236228585243226, + "epoch": 0.46484260076493084, + "grad_norm": 0.9286667108535767, + "learning_rate": 0.0001940503380302547, + "loss": 1.3246389389038087, + "mean_token_accuracy": 0.7080819308757782, + "num_tokens": 6381016.0, + "step": 1580 + }, + { + "entropy": 1.3100301146507263, + "epoch": 0.4677846425419241, + "grad_norm": 0.44687584042549133, + "learning_rate": 0.00019393964811135475, + "loss": 1.3230223655700684, + "mean_token_accuracy": 0.7061593413352967, + "num_tokens": 6421104.0, + "step": 1590 + }, + { + "entropy": 1.3405604600906371, + "epoch": 0.47072668431891734, + "grad_norm": 0.4198276400566101, + "learning_rate": 0.00019382797016344937, + "loss": 1.372106170654297, + "mean_token_accuracy": 0.6942126870155334, + "num_tokens": 6461464.0, + "step": 1600 + }, + { + "epoch": 0.47072668431891734, + "eval_entropy": 1.3327079477898405, + "eval_loss": 1.3015934228897095, + "eval_mean_token_accuracy": 0.7128373644170486, + "eval_num_tokens": 6461464.0, + "eval_runtime": 116.8886, + "eval_samples_per_second": 26.05, + "eval_steps_per_second": 3.26, + "step": 1600 + }, + { + "entropy": 1.2775195240974426, + "epoch": 0.47366872609591054, + "grad_norm": 0.36669495701789856, + "learning_rate": 0.00019371530536113372, + "loss": 1.2723214149475097, + "mean_token_accuracy": 0.7203551054000854, + "num_tokens": 6501717.0, + "step": 1610 + }, + { + "entropy": 1.263567042350769, + "epoch": 0.4766107678729038, + "grad_norm": 0.3709637522697449, + "learning_rate": 0.00019360165488938228, + "loss": 1.2917292594909668, + "mean_token_accuracy": 0.7129571974277497, + "num_tokens": 6542157.0, + "step": 1620 + }, + { + "entropy": 1.2828770697116851, + "epoch": 0.47955280964989705, + "grad_norm": 0.40589454770088196, + "learning_rate": 0.00019348701994353662, + "loss": 1.2924720764160156, + "mean_token_accuracy": 0.7142295658588409, + "num_tokens": 6582169.0, + "step": 1630 + }, + { + "entropy": 1.304066789150238, + "epoch": 0.48249485142689025, + "grad_norm": 0.41759273409843445, + "learning_rate": 0.0001933714017292927, + "loss": 1.3207477569580077, + "mean_token_accuracy": 0.7090053021907806, + "num_tokens": 6622526.0, + "step": 1640 + }, + { + "entropy": 1.2956494092941284, + "epoch": 0.4854368932038835, + "grad_norm": 0.49899545311927795, + "learning_rate": 0.00019325480146268812, + "loss": 1.3091160774230957, + "mean_token_accuracy": 0.7111666679382325, + "num_tokens": 6663165.0, + "step": 1650 + }, + { + "entropy": 1.2508437275886535, + "epoch": 0.48837893498087676, + "grad_norm": 0.4445253610610962, + "learning_rate": 0.0001931372203700895, + "loss": 1.2658663749694825, + "mean_token_accuracy": 0.7174579679965973, + "num_tokens": 6703494.0, + "step": 1660 + }, + { + "entropy": 1.287984347343445, + "epoch": 0.49132097675786995, + "grad_norm": 0.4067074656486511, + "learning_rate": 0.00019301865968817948, + "loss": 1.317389678955078, + "mean_token_accuracy": 0.7164031744003296, + "num_tokens": 6743946.0, + "step": 1670 + }, + { + "entropy": 1.272541528940201, + "epoch": 0.4942630185348632, + "grad_norm": 0.36161571741104126, + "learning_rate": 0.0001928991206639436, + "loss": 1.2725687980651856, + "mean_token_accuracy": 0.7200010478496551, + "num_tokens": 6784135.0, + "step": 1680 + }, + { + "entropy": 1.2077086687088012, + "epoch": 0.4972050603118564, + "grad_norm": 0.45158880949020386, + "learning_rate": 0.00019277860455465753, + "loss": 1.2226747512817382, + "mean_token_accuracy": 0.7305562674999238, + "num_tokens": 6824602.0, + "step": 1690 + }, + { + "entropy": 1.3152117371559142, + "epoch": 0.5001471020888497, + "grad_norm": 0.4278320074081421, + "learning_rate": 0.00019265711262787347, + "loss": 1.3219596862792968, + "mean_token_accuracy": 0.7065966606140137, + "num_tokens": 6864339.0, + "step": 1700 + }, + { + "epoch": 0.5001471020888497, + "eval_entropy": 1.2670136558727956, + "eval_loss": 1.2904326915740967, + "eval_mean_token_accuracy": 0.7152716207692004, + "eval_num_tokens": 6864339.0, + "eval_runtime": 116.9086, + "eval_samples_per_second": 26.046, + "eval_steps_per_second": 3.259, + "step": 1700 + }, + { + "entropy": 1.3221548914909362, + "epoch": 0.5030891438658429, + "grad_norm": 0.3975633382797241, + "learning_rate": 0.00019253464616140702, + "loss": 1.349279022216797, + "mean_token_accuracy": 0.7058489739894866, + "num_tokens": 6904961.0, + "step": 1710 + }, + { + "entropy": 1.3088136434555053, + "epoch": 0.5060311856428361, + "grad_norm": 0.40347951650619507, + "learning_rate": 0.00019241120644332367, + "loss": 1.3145333290100099, + "mean_token_accuracy": 0.7059141278266907, + "num_tokens": 6945507.0, + "step": 1720 + }, + { + "entropy": 1.1808736383914948, + "epoch": 0.5089732274198293, + "grad_norm": 0.36107468605041504, + "learning_rate": 0.00019228679477192534, + "loss": 1.1965153694152832, + "mean_token_accuracy": 0.7326464653015137, + "num_tokens": 6985888.0, + "step": 1730 + }, + { + "entropy": 1.360763430595398, + "epoch": 0.5119152691968226, + "grad_norm": 0.4324724078178406, + "learning_rate": 0.0001921614124557366, + "loss": 1.3564807891845703, + "mean_token_accuracy": 0.7025493502616882, + "num_tokens": 7026252.0, + "step": 1740 + }, + { + "entropy": 1.2095451593399047, + "epoch": 0.5148573109738158, + "grad_norm": 0.39905378222465515, + "learning_rate": 0.000192035060813491, + "loss": 1.267704391479492, + "mean_token_accuracy": 0.7238605260848999, + "num_tokens": 7066705.0, + "step": 1750 + }, + { + "entropy": 1.379474115371704, + "epoch": 0.517799352750809, + "grad_norm": 0.44654110074043274, + "learning_rate": 0.00019190774117411717, + "loss": 1.352774715423584, + "mean_token_accuracy": 0.6996113300323487, + "num_tokens": 7106781.0, + "step": 1760 + }, + { + "entropy": 1.2537951588630676, + "epoch": 0.5207413945278023, + "grad_norm": 0.4148572087287903, + "learning_rate": 0.00019177945487672482, + "loss": 1.3051136970520019, + "mean_token_accuracy": 0.708976149559021, + "num_tokens": 7147220.0, + "step": 1770 + }, + { + "entropy": 1.2442652940750123, + "epoch": 0.5236834363047955, + "grad_norm": 0.45965835452079773, + "learning_rate": 0.00019165020327059073, + "loss": 1.2358969688415526, + "mean_token_accuracy": 0.7264375448226928, + "num_tokens": 7187657.0, + "step": 1780 + }, + { + "entropy": 1.2980039596557618, + "epoch": 0.5266254780817887, + "grad_norm": 0.4910111725330353, + "learning_rate": 0.00019151998771514442, + "loss": 1.3084582328796386, + "mean_token_accuracy": 0.7066324114799499, + "num_tokens": 7228181.0, + "step": 1790 + }, + { + "entropy": 1.2434914827346801, + "epoch": 0.529567519858782, + "grad_norm": 0.3658753037452698, + "learning_rate": 0.0001913888095799541, + "loss": 1.2698859214782714, + "mean_token_accuracy": 0.7207267701625824, + "num_tokens": 7268538.0, + "step": 1800 + }, + { + "epoch": 0.529567519858782, + "eval_entropy": 1.2639452193978578, + "eval_loss": 1.281688928604126, + "eval_mean_token_accuracy": 0.7168223152323345, + "eval_num_tokens": 7268538.0, + "eval_runtime": 116.9626, + "eval_samples_per_second": 26.034, + "eval_steps_per_second": 3.257, + "step": 1800 + }, + { + "entropy": 1.3006227135658264, + "epoch": 0.5325095616357752, + "grad_norm": 0.4497034549713135, + "learning_rate": 0.00019125667024471186, + "loss": 1.3108736991882324, + "mean_token_accuracy": 0.7042722702026367, + "num_tokens": 7308724.0, + "step": 1810 + }, + { + "entropy": 1.275818121433258, + "epoch": 0.5354516034127684, + "grad_norm": 0.4820014536380768, + "learning_rate": 0.00019112357109921964, + "loss": 1.2958525657653808, + "mean_token_accuracy": 0.7129449069499969, + "num_tokens": 7349172.0, + "step": 1820 + }, + { + "entropy": 1.245762574672699, + "epoch": 0.5383936451897617, + "grad_norm": 0.45138782262802124, + "learning_rate": 0.00019098951354337422, + "loss": 1.2449359893798828, + "mean_token_accuracy": 0.7270717859268189, + "num_tokens": 7389600.0, + "step": 1830 + }, + { + "entropy": 1.1886737942695618, + "epoch": 0.5413356869667549, + "grad_norm": 0.42880910634994507, + "learning_rate": 0.00019085449898715274, + "loss": 1.2165825843811036, + "mean_token_accuracy": 0.7328237056732178, + "num_tokens": 7429786.0, + "step": 1840 + }, + { + "entropy": 1.261668312549591, + "epoch": 0.5442777287437481, + "grad_norm": 0.41189083456993103, + "learning_rate": 0.0001907185288505978, + "loss": 1.2767746925354004, + "mean_token_accuracy": 0.718351137638092, + "num_tokens": 7470252.0, + "step": 1850 + }, + { + "entropy": 1.2039696455001831, + "epoch": 0.5472197705207414, + "grad_norm": 0.384387344121933, + "learning_rate": 0.0001905816045638024, + "loss": 1.2115536689758302, + "mean_token_accuracy": 0.7361586034297943, + "num_tokens": 7510473.0, + "step": 1860 + }, + { + "entropy": 1.2600649833679198, + "epoch": 0.5501618122977346, + "grad_norm": 0.4238559901714325, + "learning_rate": 0.00019044372756689504, + "loss": 1.2758872985839844, + "mean_token_accuracy": 0.7137023746967316, + "num_tokens": 7550932.0, + "step": 1870 + }, + { + "entropy": 1.3069803357124328, + "epoch": 0.5531038540747278, + "grad_norm": 0.44645845890045166, + "learning_rate": 0.00019030489931002461, + "loss": 1.3207664489746094, + "mean_token_accuracy": 0.7104817628860474, + "num_tokens": 7591311.0, + "step": 1880 + }, + { + "entropy": 1.3120111107826233, + "epoch": 0.556045895851721, + "grad_norm": 0.46638140082359314, + "learning_rate": 0.00019016512125334502, + "loss": 1.3248316764831543, + "mean_token_accuracy": 0.705955958366394, + "num_tokens": 7631397.0, + "step": 1890 + }, + { + "entropy": 1.279408586025238, + "epoch": 0.5589879376287143, + "grad_norm": 0.4149499535560608, + "learning_rate": 0.00019002439486699987, + "loss": 1.2773524284362794, + "mean_token_accuracy": 0.7147800862789154, + "num_tokens": 7671968.0, + "step": 1900 + }, + { + "epoch": 0.5589879376287143, + "eval_entropy": 1.2283503402249394, + "eval_loss": 1.2789214849472046, + "eval_mean_token_accuracy": 0.7170038914743058, + "eval_num_tokens": 7671968.0, + "eval_runtime": 117.0781, + "eval_samples_per_second": 26.008, + "eval_steps_per_second": 3.254, + "step": 1900 + }, + { + "entropy": 1.230725634098053, + "epoch": 0.5619299794057075, + "grad_norm": 0.4699569046497345, + "learning_rate": 0.00018988272163110703, + "loss": 1.2412220001220704, + "mean_token_accuracy": 0.7230030238628388, + "num_tokens": 7712399.0, + "step": 1910 + }, + { + "entropy": 1.3014813184738159, + "epoch": 0.5648720211827007, + "grad_norm": 0.4014744162559509, + "learning_rate": 0.0001897401030357431, + "loss": 1.3222554206848145, + "mean_token_accuracy": 0.7079461216926575, + "num_tokens": 7752805.0, + "step": 1920 + }, + { + "entropy": 1.2195597231388091, + "epoch": 0.567814062959694, + "grad_norm": 0.4482584297657013, + "learning_rate": 0.00018959654058092753, + "loss": 1.2318772315979003, + "mean_token_accuracy": 0.7294041752815247, + "num_tokens": 7793144.0, + "step": 1930 + }, + { + "entropy": 1.1877473592758179, + "epoch": 0.5707561047366873, + "grad_norm": 0.44251689314842224, + "learning_rate": 0.00018945203577660718, + "loss": 1.2138689041137696, + "mean_token_accuracy": 0.7385743677616119, + "num_tokens": 7833447.0, + "step": 1940 + }, + { + "entropy": 1.289076042175293, + "epoch": 0.5736981465136805, + "grad_norm": 0.39147278666496277, + "learning_rate": 0.00018930659014264017, + "loss": 1.2974119186401367, + "mean_token_accuracy": 0.712678074836731, + "num_tokens": 7873974.0, + "step": 1950 + }, + { + "entropy": 1.208195787668228, + "epoch": 0.5766401882906738, + "grad_norm": 0.5337726473808289, + "learning_rate": 0.00018916020520877994, + "loss": 1.2156153678894044, + "mean_token_accuracy": 0.726024466753006, + "num_tokens": 7914528.0, + "step": 1960 + }, + { + "entropy": 1.2428096413612366, + "epoch": 0.579582230067667, + "grad_norm": 0.41402509808540344, + "learning_rate": 0.00018901288251465937, + "loss": 1.2540960311889648, + "mean_token_accuracy": 0.7271076798439026, + "num_tokens": 7954995.0, + "step": 1970 + }, + { + "entropy": 1.3336944222450255, + "epoch": 0.5825242718446602, + "grad_norm": 0.47286257147789, + "learning_rate": 0.00018886462360977418, + "loss": 1.3511391639709474, + "mean_token_accuracy": 0.7000328660011291, + "num_tokens": 7995173.0, + "step": 1980 + }, + { + "entropy": 1.210561752319336, + "epoch": 0.5854663136216535, + "grad_norm": 0.4485618472099304, + "learning_rate": 0.00018871543005346712, + "loss": 1.222616958618164, + "mean_token_accuracy": 0.7286027550697327, + "num_tokens": 8035513.0, + "step": 1990 + }, + { + "entropy": 1.2922529458999634, + "epoch": 0.5884083553986467, + "grad_norm": 0.4895835220813751, + "learning_rate": 0.0001885653034149111, + "loss": 1.3031671524047852, + "mean_token_accuracy": 0.7130830705165863, + "num_tokens": 8075879.0, + "step": 2000 + }, + { + "epoch": 0.5884083553986467, + "eval_entropy": 1.2732777583004609, + "eval_loss": 1.2699540853500366, + "eval_mean_token_accuracy": 0.7192083870957843, + "eval_num_tokens": 8075879.0, + "eval_runtime": 116.9367, + "eval_samples_per_second": 26.04, + "eval_steps_per_second": 3.258, + "step": 2000 + }, + { + "entropy": 1.2049331665039062, + "epoch": 0.5913503971756399, + "grad_norm": 0.42137858271598816, + "learning_rate": 0.00018841424527309312, + "loss": 1.220776081085205, + "mean_token_accuracy": 0.7255268514156341, + "num_tokens": 8116494.0, + "step": 2010 + }, + { + "entropy": 1.2747172474861146, + "epoch": 0.5942924389526332, + "grad_norm": 0.47128528356552124, + "learning_rate": 0.0001882622572167973, + "loss": 1.2953474044799804, + "mean_token_accuracy": 0.7114850461483002, + "num_tokens": 8156917.0, + "step": 2020 + }, + { + "entropy": 1.2040826320648192, + "epoch": 0.5972344807296264, + "grad_norm": 0.477896124124527, + "learning_rate": 0.0001881093408445884, + "loss": 1.2177928924560546, + "mean_token_accuracy": 0.7306720972061157, + "num_tokens": 8197335.0, + "step": 2030 + }, + { + "entropy": 1.2654430389404296, + "epoch": 0.6001765225066196, + "grad_norm": 0.5165997743606567, + "learning_rate": 0.00018795549776479478, + "loss": 1.275616455078125, + "mean_token_accuracy": 0.7176172435283661, + "num_tokens": 8237564.0, + "step": 2040 + }, + { + "entropy": 1.2124343276023866, + "epoch": 0.6031185642836129, + "grad_norm": 0.47672176361083984, + "learning_rate": 0.0001878007295954919, + "loss": 1.2235189437866212, + "mean_token_accuracy": 0.7277295291423798, + "num_tokens": 8277586.0, + "step": 2050 + }, + { + "entropy": 1.2670048475265503, + "epoch": 0.6060606060606061, + "grad_norm": 0.42278149724006653, + "learning_rate": 0.00018764503796448478, + "loss": 1.284127426147461, + "mean_token_accuracy": 0.7204706132411957, + "num_tokens": 8318120.0, + "step": 2060 + }, + { + "entropy": 1.2286874294281005, + "epoch": 0.6090026478375993, + "grad_norm": 0.4719674587249756, + "learning_rate": 0.0001874884245092913, + "loss": 1.2510007858276366, + "mean_token_accuracy": 0.7220764279365539, + "num_tokens": 8358689.0, + "step": 2070 + }, + { + "entropy": 1.298528516292572, + "epoch": 0.6119446896145925, + "grad_norm": 0.46403807401657104, + "learning_rate": 0.00018733089087712469, + "loss": 1.2856470108032227, + "mean_token_accuracy": 0.7156751930713654, + "num_tokens": 8399274.0, + "step": 2080 + }, + { + "entropy": 1.268327033519745, + "epoch": 0.6148867313915858, + "grad_norm": 0.4383772611618042, + "learning_rate": 0.00018717243872487643, + "loss": 1.3167724609375, + "mean_token_accuracy": 0.7053386807441712, + "num_tokens": 8439732.0, + "step": 2090 + }, + { + "entropy": 1.3164120435714721, + "epoch": 0.617828773168579, + "grad_norm": 0.5242652893066406, + "learning_rate": 0.00018701306971909864, + "loss": 1.3042461395263671, + "mean_token_accuracy": 0.7102492094039917, + "num_tokens": 8479716.0, + "step": 2100 + }, + { + "epoch": 0.617828773168579, + "eval_entropy": 1.2351457424989836, + "eval_loss": 1.2649626731872559, + "eval_mean_token_accuracy": 0.7205018502833649, + "eval_num_tokens": 8479716.0, + "eval_runtime": 117.0816, + "eval_samples_per_second": 26.008, + "eval_steps_per_second": 3.254, + "step": 2100 + }, + { + "entropy": 1.2392801344394684, + "epoch": 0.6207708149455722, + "grad_norm": 0.47463512420654297, + "learning_rate": 0.00018685278553598665, + "loss": 1.2612558364868165, + "mean_token_accuracy": 0.7227189362049102, + "num_tokens": 8520008.0, + "step": 2110 + }, + { + "entropy": 1.2834307312965394, + "epoch": 0.6237128567225655, + "grad_norm": 0.4995603859424591, + "learning_rate": 0.0001866915878613614, + "loss": 1.2821990966796875, + "mean_token_accuracy": 0.7140654444694519, + "num_tokens": 8560586.0, + "step": 2120 + }, + { + "entropy": 1.2452851295471192, + "epoch": 0.6266548984995587, + "grad_norm": 0.40243977308273315, + "learning_rate": 0.00018652947839065159, + "loss": 1.2768383026123047, + "mean_token_accuracy": 0.7194826602935791, + "num_tokens": 8601116.0, + "step": 2130 + }, + { + "entropy": 1.2355848252773285, + "epoch": 0.6295969402765519, + "grad_norm": 0.3945181369781494, + "learning_rate": 0.00018636645882887592, + "loss": 1.2494622230529786, + "mean_token_accuracy": 0.7192557215690613, + "num_tokens": 8641094.0, + "step": 2140 + }, + { + "entropy": 1.243088138103485, + "epoch": 0.6325389820535452, + "grad_norm": 0.43109068274497986, + "learning_rate": 0.0001862025308906252, + "loss": 1.2423103332519532, + "mean_token_accuracy": 0.7271890878677368, + "num_tokens": 8681458.0, + "step": 2150 + }, + { + "entropy": 1.2035149574279784, + "epoch": 0.6354810238305384, + "grad_norm": 0.4727862477302551, + "learning_rate": 0.0001860376963000443, + "loss": 1.2361689567565919, + "mean_token_accuracy": 0.7284120082855224, + "num_tokens": 8722066.0, + "step": 2160 + }, + { + "entropy": 1.2721379399299622, + "epoch": 0.6384230656075316, + "grad_norm": 0.5057058930397034, + "learning_rate": 0.00018587195679081386, + "loss": 1.2708622932434082, + "mean_token_accuracy": 0.7155144691467286, + "num_tokens": 8762672.0, + "step": 2170 + }, + { + "entropy": 1.2714126229286193, + "epoch": 0.6413651073845249, + "grad_norm": 0.4464847147464752, + "learning_rate": 0.0001857053141061323, + "loss": 1.2917202949523925, + "mean_token_accuracy": 0.7170847177505493, + "num_tokens": 8802819.0, + "step": 2180 + }, + { + "entropy": 1.2385571956634522, + "epoch": 0.6443071491615181, + "grad_norm": 0.4425857365131378, + "learning_rate": 0.00018553776999869737, + "loss": 1.252675437927246, + "mean_token_accuracy": 0.7212274014949799, + "num_tokens": 8842977.0, + "step": 2190 + }, + { + "entropy": 1.2919608235359192, + "epoch": 0.6472491909385113, + "grad_norm": 0.43156683444976807, + "learning_rate": 0.00018536932623068757, + "loss": 1.2964573860168458, + "mean_token_accuracy": 0.7152363657951355, + "num_tokens": 8883579.0, + "step": 2200 + }, + { + "epoch": 0.6472491909385113, + "eval_entropy": 1.2316402986919472, + "eval_loss": 1.2606068849563599, + "eval_mean_token_accuracy": 0.7208151789162103, + "eval_num_tokens": 8883579.0, + "eval_runtime": 117.0496, + "eval_samples_per_second": 26.015, + "eval_steps_per_second": 3.255, + "step": 2200 + }, + { + "entropy": 1.3093620181083678, + "epoch": 0.6501912327155046, + "grad_norm": 0.4637244641780853, + "learning_rate": 0.00018519998457374395, + "loss": 1.34036865234375, + "mean_token_accuracy": 0.7034331321716308, + "num_tokens": 8923903.0, + "step": 2210 + }, + { + "entropy": 1.214417290687561, + "epoch": 0.6531332744924978, + "grad_norm": 0.5183310508728027, + "learning_rate": 0.00018502974680895115, + "loss": 1.230905532836914, + "mean_token_accuracy": 0.7264467597007751, + "num_tokens": 8963848.0, + "step": 2220 + }, + { + "entropy": 1.2735732913017273, + "epoch": 0.656075316269491, + "grad_norm": 0.6038491725921631, + "learning_rate": 0.00018485861472681888, + "loss": 1.2594423294067383, + "mean_token_accuracy": 0.7218607306480408, + "num_tokens": 9003224.0, + "step": 2230 + }, + { + "entropy": 1.278822934627533, + "epoch": 0.6590173580464843, + "grad_norm": 0.47614696621894836, + "learning_rate": 0.00018468659012726301, + "loss": 1.3205299377441406, + "mean_token_accuracy": 0.7092170417308807, + "num_tokens": 9043662.0, + "step": 2240 + }, + { + "entropy": 1.2612668752670289, + "epoch": 0.6619593998234775, + "grad_norm": 0.4333754777908325, + "learning_rate": 0.00018451367481958655, + "loss": 1.269089412689209, + "mean_token_accuracy": 0.7188269674777985, + "num_tokens": 9084132.0, + "step": 2250 + }, + { + "entropy": 1.2136012673377992, + "epoch": 0.6649014416004707, + "grad_norm": 0.44086429476737976, + "learning_rate": 0.0001843398706224608, + "loss": 1.222397518157959, + "mean_token_accuracy": 0.7309954643249512, + "num_tokens": 9124313.0, + "step": 2260 + }, + { + "entropy": 1.2528056859970094, + "epoch": 0.6678434833774639, + "grad_norm": 0.45429208874702454, + "learning_rate": 0.0001841651793639061, + "loss": 1.2543872833251952, + "mean_token_accuracy": 0.7223168253898621, + "num_tokens": 9164680.0, + "step": 2270 + }, + { + "entropy": 1.2683658719062805, + "epoch": 0.6707855251544572, + "grad_norm": 0.4545646011829376, + "learning_rate": 0.00018398960288127264, + "loss": 1.3083954811096192, + "mean_token_accuracy": 0.7077820837497711, + "num_tokens": 9205179.0, + "step": 2280 + }, + { + "entropy": 1.267857301235199, + "epoch": 0.6737275669314504, + "grad_norm": 0.43586161732673645, + "learning_rate": 0.00018381314302122115, + "loss": 1.2671592712402344, + "mean_token_accuracy": 0.7227232694625855, + "num_tokens": 9245707.0, + "step": 2290 + }, + { + "entropy": 1.191249167919159, + "epoch": 0.6766696087084436, + "grad_norm": 0.5259418487548828, + "learning_rate": 0.00018363580163970343, + "loss": 1.1978882789611816, + "mean_token_accuracy": 0.7335732400417327, + "num_tokens": 9286200.0, + "step": 2300 + }, + { + "epoch": 0.6766696087084436, + "eval_entropy": 1.1682455519365826, + "eval_loss": 1.257144808769226, + "eval_mean_token_accuracy": 0.7223104308909318, + "eval_num_tokens": 9286200.0, + "eval_runtime": 116.9696, + "eval_samples_per_second": 26.032, + "eval_steps_per_second": 3.257, + "step": 2300 + }, + { + "entropy": 1.2716636419296266, + "epoch": 0.6796116504854369, + "grad_norm": 0.415056049823761, + "learning_rate": 0.00018345758060194287, + "loss": 1.2905259132385254, + "mean_token_accuracy": 0.710951566696167, + "num_tokens": 9326847.0, + "step": 2310 + }, + { + "entropy": 1.1874773681163788, + "epoch": 0.6825536922624301, + "grad_norm": 0.49722516536712646, + "learning_rate": 0.00018327848178241481, + "loss": 1.217663288116455, + "mean_token_accuracy": 0.7317953050136566, + "num_tokens": 9367084.0, + "step": 2320 + }, + { + "entropy": 1.2677314758300782, + "epoch": 0.6854957340394233, + "grad_norm": 0.4097212255001068, + "learning_rate": 0.00018309850706482687, + "loss": 1.2633783340454101, + "mean_token_accuracy": 0.7190278351306916, + "num_tokens": 9407189.0, + "step": 2330 + }, + { + "entropy": 1.2582902312278748, + "epoch": 0.6884377758164166, + "grad_norm": 0.40571051836013794, + "learning_rate": 0.00018291765834209907, + "loss": 1.2858672142028809, + "mean_token_accuracy": 0.7170897841453552, + "num_tokens": 9447880.0, + "step": 2340 + }, + { + "entropy": 1.327420747280121, + "epoch": 0.6913798175934098, + "grad_norm": 0.48363062739372253, + "learning_rate": 0.0001827359375163439, + "loss": 1.327678108215332, + "mean_token_accuracy": 0.7030242502689361, + "num_tokens": 9488502.0, + "step": 2350 + }, + { + "entropy": 1.1727963089942932, + "epoch": 0.694321859370403, + "grad_norm": 0.6001095175743103, + "learning_rate": 0.00018255334649884653, + "loss": 1.2080462455749512, + "mean_token_accuracy": 0.7356064558029175, + "num_tokens": 9529082.0, + "step": 2360 + }, + { + "entropy": 1.2679656863212585, + "epoch": 0.6972639011473963, + "grad_norm": 0.4376157522201538, + "learning_rate": 0.00018236988721004435, + "loss": 1.2510211944580079, + "mean_token_accuracy": 0.7209162175655365, + "num_tokens": 9569520.0, + "step": 2370 + }, + { + "entropy": 1.192594301700592, + "epoch": 0.7002059429243895, + "grad_norm": 0.4717561900615692, + "learning_rate": 0.00018218556157950712, + "loss": 1.2164905548095704, + "mean_token_accuracy": 0.7315115988254547, + "num_tokens": 9609903.0, + "step": 2380 + }, + { + "entropy": 1.3375505089759827, + "epoch": 0.7031479847013827, + "grad_norm": 0.5176673531532288, + "learning_rate": 0.00018200037154591643, + "loss": 1.3507174491882323, + "mean_token_accuracy": 0.69825981259346, + "num_tokens": 9650434.0, + "step": 2390 + }, + { + "entropy": 1.2428280234336853, + "epoch": 0.706090026478376, + "grad_norm": 0.4390230178833008, + "learning_rate": 0.00018181431905704546, + "loss": 1.254446029663086, + "mean_token_accuracy": 0.7244620621204376, + "num_tokens": 9690991.0, + "step": 2400 + }, + { + "epoch": 0.706090026478376, + "eval_entropy": 1.2351404309585652, + "eval_loss": 1.2472655773162842, + "eval_mean_token_accuracy": 0.7242527881006556, + "eval_num_tokens": 9690991.0, + "eval_runtime": 117.0968, + "eval_samples_per_second": 26.004, + "eval_steps_per_second": 3.254, + "step": 2400 + }, + { + "entropy": 1.1983890414237977, + "epoch": 0.7090320682553692, + "grad_norm": 0.44956910610198975, + "learning_rate": 0.0001816274060697384, + "loss": 1.2177659034729005, + "mean_token_accuracy": 0.731769061088562, + "num_tokens": 9731440.0, + "step": 2410 + }, + { + "entropy": 1.2803593873977661, + "epoch": 0.7119741100323624, + "grad_norm": 0.4463217854499817, + "learning_rate": 0.00018143963454988994, + "loss": 1.2735008239746093, + "mean_token_accuracy": 0.7187759101390838, + "num_tokens": 9771061.0, + "step": 2420 + }, + { + "entropy": 1.22296462059021, + "epoch": 0.7149161518093556, + "grad_norm": 0.4359455406665802, + "learning_rate": 0.0001812510064724245, + "loss": 1.260395622253418, + "mean_token_accuracy": 0.7190466344356536, + "num_tokens": 9811550.0, + "step": 2430 + }, + { + "entropy": 1.2306616604328156, + "epoch": 0.717858193586349, + "grad_norm": 0.4980376660823822, + "learning_rate": 0.0001810615238212755, + "loss": 1.227048110961914, + "mean_token_accuracy": 0.7294842720031738, + "num_tokens": 9852102.0, + "step": 2440 + }, + { + "entropy": 1.2669499397277832, + "epoch": 0.7208002353633421, + "grad_norm": 0.4520755708217621, + "learning_rate": 0.00018087118858936462, + "loss": 1.2932658195495605, + "mean_token_accuracy": 0.7133265674114228, + "num_tokens": 9892746.0, + "step": 2450 + }, + { + "entropy": 1.2530406713485718, + "epoch": 0.7237422771403353, + "grad_norm": 0.6179884076118469, + "learning_rate": 0.00018068000277858065, + "loss": 1.2789620399475097, + "mean_token_accuracy": 0.7135308802127838, + "num_tokens": 9933185.0, + "step": 2460 + }, + { + "entropy": 1.2204147577285767, + "epoch": 0.7266843189173287, + "grad_norm": 0.4976007044315338, + "learning_rate": 0.00018048796839975856, + "loss": 1.2141535758972168, + "mean_token_accuracy": 0.7293932437896729, + "num_tokens": 9973384.0, + "step": 2470 + }, + { + "entropy": 1.1704169631004333, + "epoch": 0.7296263606943219, + "grad_norm": 0.44038140773773193, + "learning_rate": 0.0001802950874726582, + "loss": 1.1908206939697266, + "mean_token_accuracy": 0.7376551747322082, + "num_tokens": 10013764.0, + "step": 2480 + }, + { + "entropy": 1.2523088693618774, + "epoch": 0.732568402471315, + "grad_norm": 0.46554332971572876, + "learning_rate": 0.00018010136202594332, + "loss": 1.2656194686889648, + "mean_token_accuracy": 0.7242594540119172, + "num_tokens": 10054063.0, + "step": 2490 + }, + { + "entropy": 1.3448559761047363, + "epoch": 0.7355104442483084, + "grad_norm": 0.4372340142726898, + "learning_rate": 0.00017990679409715993, + "loss": 1.3519328117370606, + "mean_token_accuracy": 0.698470801115036, + "num_tokens": 10094687.0, + "step": 2500 + }, + { + "epoch": 0.7355104442483084, + "eval_entropy": 1.2676614907782848, + "eval_loss": 1.2452576160430908, + "eval_mean_token_accuracy": 0.7236289200507436, + "eval_num_tokens": 10094687.0, + "eval_runtime": 116.8804, + "eval_samples_per_second": 26.052, + "eval_steps_per_second": 3.26, + "step": 2500 + }, + { + "entropy": 1.2084831714630127, + "epoch": 0.7384524860253016, + "grad_norm": 0.4193192422389984, + "learning_rate": 0.00017971138573271507, + "loss": 1.218832778930664, + "mean_token_accuracy": 0.7289343297481536, + "num_tokens": 10135026.0, + "step": 2510 + }, + { + "entropy": 1.1639327347278594, + "epoch": 0.7413945278022948, + "grad_norm": 0.48731788992881775, + "learning_rate": 0.0001795151389878552, + "loss": 1.1885252952575684, + "mean_token_accuracy": 0.7402738213539124, + "num_tokens": 10175310.0, + "step": 2520 + }, + { + "entropy": 1.2578992068767547, + "epoch": 0.7443365695792881, + "grad_norm": 0.434038370847702, + "learning_rate": 0.00017931805592664472, + "loss": 1.26021728515625, + "mean_token_accuracy": 0.7184948623180389, + "num_tokens": 10215765.0, + "step": 2530 + }, + { + "entropy": 1.2038448989391326, + "epoch": 0.7472786113562813, + "grad_norm": 0.48660168051719666, + "learning_rate": 0.00017912013862194404, + "loss": 1.211390781402588, + "mean_token_accuracy": 0.7284208476543427, + "num_tokens": 10256371.0, + "step": 2540 + }, + { + "entropy": 1.1407162606716157, + "epoch": 0.7502206531332745, + "grad_norm": 0.5016793608665466, + "learning_rate": 0.0001789213891553879, + "loss": 1.1628236770629883, + "mean_token_accuracy": 0.7418089389801026, + "num_tokens": 10296590.0, + "step": 2550 + }, + { + "entropy": 1.2328106760978699, + "epoch": 0.7531626949102678, + "grad_norm": 0.42070743441581726, + "learning_rate": 0.00017872180961736356, + "loss": 1.245603656768799, + "mean_token_accuracy": 0.7262676537036896, + "num_tokens": 10337069.0, + "step": 2560 + }, + { + "entropy": 1.2932706713676452, + "epoch": 0.756104736687261, + "grad_norm": 0.46600809693336487, + "learning_rate": 0.00017852140210698858, + "loss": 1.2860488891601562, + "mean_token_accuracy": 0.7088023841381073, + "num_tokens": 10377428.0, + "step": 2570 + }, + { + "entropy": 1.147745430469513, + "epoch": 0.7590467784642542, + "grad_norm": 0.4609155058860779, + "learning_rate": 0.00017832016873208905, + "loss": 1.1787425994873046, + "mean_token_accuracy": 0.7361489832401276, + "num_tokens": 10417739.0, + "step": 2580 + }, + { + "entropy": 1.2378079295158386, + "epoch": 0.7619888202412475, + "grad_norm": 0.4880058467388153, + "learning_rate": 0.00017811811160917712, + "loss": 1.2569812774658202, + "mean_token_accuracy": 0.7204902648925782, + "num_tokens": 10458215.0, + "step": 2590 + }, + { + "entropy": 1.2285701274871825, + "epoch": 0.7649308620182407, + "grad_norm": 0.5326588153839111, + "learning_rate": 0.0001779152328634289, + "loss": 1.221341609954834, + "mean_token_accuracy": 0.7287818729877472, + "num_tokens": 10498647.0, + "step": 2600 + }, + { + "epoch": 0.7649308620182407, + "eval_entropy": 1.2003295410649357, + "eval_loss": 1.236678957939148, + "eval_mean_token_accuracy": 0.7259835525760501, + "eval_num_tokens": 10498647.0, + "eval_runtime": 116.9331, + "eval_samples_per_second": 26.041, + "eval_steps_per_second": 3.258, + "step": 2600 + }, + { + "entropy": 1.2428824484348298, + "epoch": 0.7678729037952339, + "grad_norm": 0.45255497097969055, + "learning_rate": 0.00017771153462866216, + "loss": 1.2709949493408204, + "mean_token_accuracy": 0.7177605211734772, + "num_tokens": 10539009.0, + "step": 2610 + }, + { + "entropy": 1.277386212348938, + "epoch": 0.7708149455722271, + "grad_norm": 0.48587530851364136, + "learning_rate": 0.00017750701904731373, + "loss": 1.2778440475463868, + "mean_token_accuracy": 0.7140256285667419, + "num_tokens": 10579502.0, + "step": 2620 + }, + { + "entropy": 1.2311038613319396, + "epoch": 0.7737569873492204, + "grad_norm": 0.6130372285842896, + "learning_rate": 0.00017730168827041708, + "loss": 1.2585201263427734, + "mean_token_accuracy": 0.7184097468852997, + "num_tokens": 10619385.0, + "step": 2630 + }, + { + "entropy": 1.2518787860870362, + "epoch": 0.7766990291262136, + "grad_norm": 0.43607237935066223, + "learning_rate": 0.00017709554445757966, + "loss": 1.2394478797912598, + "mean_token_accuracy": 0.726053637266159, + "num_tokens": 10659852.0, + "step": 2640 + }, + { + "entropy": 1.2289941668510438, + "epoch": 0.7796410709032068, + "grad_norm": 0.4687124788761139, + "learning_rate": 0.00017688858977696014, + "loss": 1.2466455459594727, + "mean_token_accuracy": 0.7200910389423371, + "num_tokens": 10700123.0, + "step": 2650 + }, + { + "entropy": 1.232028889656067, + "epoch": 0.7825831126802001, + "grad_norm": 0.45022356510162354, + "learning_rate": 0.00017668082640524574, + "loss": 1.2503207206726075, + "mean_token_accuracy": 0.7186446607112884, + "num_tokens": 10740503.0, + "step": 2660 + }, + { + "entropy": 1.204378592967987, + "epoch": 0.7855251544571933, + "grad_norm": 0.41389408707618713, + "learning_rate": 0.0001764722565276292, + "loss": 1.211115264892578, + "mean_token_accuracy": 0.7328132688999176, + "num_tokens": 10780655.0, + "step": 2670 + }, + { + "entropy": 1.2730337023735045, + "epoch": 0.7884671962341865, + "grad_norm": 0.4794485569000244, + "learning_rate": 0.00017626288233778582, + "loss": 1.2866595268249512, + "mean_token_accuracy": 0.7171245098114014, + "num_tokens": 10821171.0, + "step": 2680 + }, + { + "entropy": 1.2131190776824952, + "epoch": 0.7914092380111798, + "grad_norm": 0.4519226551055908, + "learning_rate": 0.00017605270603785047, + "loss": 1.228813934326172, + "mean_token_accuracy": 0.7272532522678375, + "num_tokens": 10861594.0, + "step": 2690 + }, + { + "entropy": 1.2511601805686952, + "epoch": 0.794351279788173, + "grad_norm": 0.4799201488494873, + "learning_rate": 0.00017584172983839435, + "loss": 1.2575819969177247, + "mean_token_accuracy": 0.7226161539554596, + "num_tokens": 10902043.0, + "step": 2700 + }, + { + "epoch": 0.794351279788173, + "eval_entropy": 1.2509275682642078, + "eval_loss": 1.2339112758636475, + "eval_mean_token_accuracy": 0.7265911623248904, + "eval_num_tokens": 10902043.0, + "eval_runtime": 116.9442, + "eval_samples_per_second": 26.038, + "eval_steps_per_second": 3.258, + "step": 2700 + }, + { + "entropy": 1.2342944502830506, + "epoch": 0.7972933215651662, + "grad_norm": 0.49187448620796204, + "learning_rate": 0.00017562995595840178, + "loss": 1.2416543006896972, + "mean_token_accuracy": 0.7259420096874237, + "num_tokens": 10942392.0, + "step": 2710 + }, + { + "entropy": 1.1988507807254791, + "epoch": 0.8002353633421595, + "grad_norm": 0.4817524254322052, + "learning_rate": 0.00017541738662524677, + "loss": 1.2237167358398438, + "mean_token_accuracy": 0.7307404637336731, + "num_tokens": 10982924.0, + "step": 2720 + }, + { + "entropy": 1.236850619316101, + "epoch": 0.8031774051191527, + "grad_norm": 0.4729112684726715, + "learning_rate": 0.0001752040240746698, + "loss": 1.2266542434692382, + "mean_token_accuracy": 0.7280332922935486, + "num_tokens": 11023305.0, + "step": 2730 + }, + { + "entropy": 1.1383702993392943, + "epoch": 0.8061194468961459, + "grad_norm": 0.42324507236480713, + "learning_rate": 0.00017498987055075403, + "loss": 1.1699549674987793, + "mean_token_accuracy": 0.7414192616939544, + "num_tokens": 11063084.0, + "step": 2740 + }, + { + "entropy": 1.1376792788505554, + "epoch": 0.8090614886731392, + "grad_norm": 0.5435130000114441, + "learning_rate": 0.00017477492830590192, + "loss": 1.1505720138549804, + "mean_token_accuracy": 0.7437731683254242, + "num_tokens": 11103518.0, + "step": 2750 + }, + { + "entropy": 1.2045920014381408, + "epoch": 0.8120035304501324, + "grad_norm": 0.4564155340194702, + "learning_rate": 0.00017455919960081149, + "loss": 1.1978718757629394, + "mean_token_accuracy": 0.7353939712047577, + "num_tokens": 11143802.0, + "step": 2760 + }, + { + "entropy": 1.197198224067688, + "epoch": 0.8149455722271256, + "grad_norm": 0.457720011472702, + "learning_rate": 0.0001743426867044524, + "loss": 1.2358501434326172, + "mean_token_accuracy": 0.7267086863517761, + "num_tokens": 11184158.0, + "step": 2770 + }, + { + "entropy": 1.2718565106391906, + "epoch": 0.8178876140041189, + "grad_norm": 0.4250863194465637, + "learning_rate": 0.00017412539189404233, + "loss": 1.2535717010498046, + "mean_token_accuracy": 0.7206644594669342, + "num_tokens": 11224574.0, + "step": 2780 + }, + { + "entropy": 1.172040694952011, + "epoch": 0.8208296557811121, + "grad_norm": 0.5394912958145142, + "learning_rate": 0.00017390731745502283, + "loss": 1.2080710411071778, + "mean_token_accuracy": 0.7338366210460663, + "num_tokens": 11265027.0, + "step": 2790 + }, + { + "entropy": 1.187865948677063, + "epoch": 0.8237716975581053, + "grad_norm": 0.49136948585510254, + "learning_rate": 0.00017368846568103529, + "loss": 1.1811614990234376, + "mean_token_accuracy": 0.7365565001964569, + "num_tokens": 11305432.0, + "step": 2800 + }, + { + "epoch": 0.8237716975581053, + "eval_entropy": 1.2183332227346466, + "eval_loss": 1.2257474660873413, + "eval_mean_token_accuracy": 0.7285947153574526, + "eval_num_tokens": 11305432.0, + "eval_runtime": 116.8432, + "eval_samples_per_second": 26.061, + "eval_steps_per_second": 3.261, + "step": 2800 + }, + { + "entropy": 1.1511970162391663, + "epoch": 0.8267137393350985, + "grad_norm": 0.487531840801239, + "learning_rate": 0.00017346883887389702, + "loss": 1.1708711624145507, + "mean_token_accuracy": 0.7420612633228302, + "num_tokens": 11345702.0, + "step": 2810 + }, + { + "entropy": 1.2634961485862732, + "epoch": 0.8296557811120918, + "grad_norm": 0.4640207886695862, + "learning_rate": 0.00017324843934357674, + "loss": 1.277150821685791, + "mean_token_accuracy": 0.7138958215713501, + "num_tokens": 11386240.0, + "step": 2820 + }, + { + "entropy": 1.1891654789447785, + "epoch": 0.832597822889085, + "grad_norm": 0.47752827405929565, + "learning_rate": 0.0001730272694081706, + "loss": 1.193849754333496, + "mean_token_accuracy": 0.7351171731948852, + "num_tokens": 11426663.0, + "step": 2830 + }, + { + "entropy": 1.2108049154281617, + "epoch": 0.8355398646660782, + "grad_norm": 0.514695942401886, + "learning_rate": 0.0001728053313938775, + "loss": 1.2478459358215332, + "mean_token_accuracy": 0.721454119682312, + "num_tokens": 11467195.0, + "step": 2840 + }, + { + "entropy": 1.2818018913269043, + "epoch": 0.8384819064430715, + "grad_norm": 0.517238438129425, + "learning_rate": 0.00017258262763497482, + "loss": 1.2610112190246583, + "mean_token_accuracy": 0.7156127452850342, + "num_tokens": 11507742.0, + "step": 2850 + }, + { + "entropy": 1.1540677964687347, + "epoch": 0.8414239482200647, + "grad_norm": 0.4621984362602234, + "learning_rate": 0.00017235916047379383, + "loss": 1.187222385406494, + "mean_token_accuracy": 0.7345345914363861, + "num_tokens": 11548215.0, + "step": 2860 + }, + { + "entropy": 1.2128564953804015, + "epoch": 0.8443659899970579, + "grad_norm": 0.4317253530025482, + "learning_rate": 0.000172134932260695, + "loss": 1.2142438888549805, + "mean_token_accuracy": 0.7298630118370056, + "num_tokens": 11588711.0, + "step": 2870 + }, + { + "entropy": 1.1962445259094239, + "epoch": 0.8473080317740512, + "grad_norm": 0.6231359839439392, + "learning_rate": 0.00017190994535404332, + "loss": 1.221367359161377, + "mean_token_accuracy": 0.7313773334026337, + "num_tokens": 11629162.0, + "step": 2880 + }, + { + "entropy": 1.2900652885437012, + "epoch": 0.8502500735510444, + "grad_norm": 0.4637579321861267, + "learning_rate": 0.00017168420212018354, + "loss": 1.2854097366333008, + "mean_token_accuracy": 0.7109606087207794, + "num_tokens": 11669650.0, + "step": 2890 + }, + { + "entropy": 1.2192873358726501, + "epoch": 0.8531921153280376, + "grad_norm": 0.4562380909919739, + "learning_rate": 0.00017145770493341518, + "loss": 1.2436570167541503, + "mean_token_accuracy": 0.7263434827327728, + "num_tokens": 11710292.0, + "step": 2900 + }, + { + "epoch": 0.8531921153280376, + "eval_entropy": 1.2169157832939168, + "eval_loss": 1.219910979270935, + "eval_mean_token_accuracy": 0.7299123456471861, + "eval_num_tokens": 11710292.0, + "eval_runtime": 116.9129, + "eval_samples_per_second": 26.045, + "eval_steps_per_second": 3.259, + "step": 2900 + }, + { + "entropy": 1.2365688323974608, + "epoch": 0.8561341571050309, + "grad_norm": 0.46011191606521606, + "learning_rate": 0.00017123045617596763, + "loss": 1.2509427070617676, + "mean_token_accuracy": 0.7250486254692078, + "num_tokens": 11750649.0, + "step": 2910 + }, + { + "entropy": 1.2337135434150697, + "epoch": 0.8590761988820241, + "grad_norm": 0.5224452018737793, + "learning_rate": 0.00017100245823797503, + "loss": 1.2394111633300782, + "mean_token_accuracy": 0.729231595993042, + "num_tokens": 11791097.0, + "step": 2920 + }, + { + "entropy": 1.2292932152748108, + "epoch": 0.8620182406590173, + "grad_norm": 0.48644399642944336, + "learning_rate": 0.00017077371351745124, + "loss": 1.2508816719055176, + "mean_token_accuracy": 0.718151307106018, + "num_tokens": 11831392.0, + "step": 2930 + }, + { + "entropy": 1.1478191256523131, + "epoch": 0.8649602824360106, + "grad_norm": 0.4293639063835144, + "learning_rate": 0.00017054422442026456, + "loss": 1.1457470893859862, + "mean_token_accuracy": 0.7468528985977173, + "num_tokens": 11871925.0, + "step": 2940 + }, + { + "entropy": 1.2405227303504944, + "epoch": 0.8679023242130038, + "grad_norm": 0.6136884689331055, + "learning_rate": 0.00017031399336011238, + "loss": 1.2617270469665527, + "mean_token_accuracy": 0.716576772928238, + "num_tokens": 11912605.0, + "step": 2950 + }, + { + "entropy": 1.2667588710784912, + "epoch": 0.870844365989997, + "grad_norm": 0.4269845485687256, + "learning_rate": 0.00017008302275849582, + "loss": 1.283921241760254, + "mean_token_accuracy": 0.7173857808113098, + "num_tokens": 11953010.0, + "step": 2960 + }, + { + "entropy": 1.3015403628349305, + "epoch": 0.8737864077669902, + "grad_norm": 0.46401771903038025, + "learning_rate": 0.0001698513150446943, + "loss": 1.3019817352294922, + "mean_token_accuracy": 0.7130701661109924, + "num_tokens": 11993545.0, + "step": 2970 + }, + { + "entropy": 1.2569140315055847, + "epoch": 0.8767284495439835, + "grad_norm": 0.47312793135643005, + "learning_rate": 0.00016961887265574, + "loss": 1.279769515991211, + "mean_token_accuracy": 0.7160746216773987, + "num_tokens": 12033961.0, + "step": 2980 + }, + { + "entropy": 1.253390657901764, + "epoch": 0.8796704913209767, + "grad_norm": 0.5449343323707581, + "learning_rate": 0.0001693856980363921, + "loss": 1.259514045715332, + "mean_token_accuracy": 0.7175013661384583, + "num_tokens": 12074553.0, + "step": 2990 + }, + { + "entropy": 1.3062225699424743, + "epoch": 0.8826125330979699, + "grad_norm": 0.4499056041240692, + "learning_rate": 0.00016915179363911125, + "loss": 1.3181246757507323, + "mean_token_accuracy": 0.7042843997478485, + "num_tokens": 12115075.0, + "step": 3000 + }, + { + "epoch": 0.8826125330979699, + "eval_entropy": 1.2066716964789264, + "eval_loss": 1.2160207033157349, + "eval_mean_token_accuracy": 0.7302276686107706, + "eval_num_tokens": 12115075.0, + "eval_runtime": 117.081, + "eval_samples_per_second": 26.008, + "eval_steps_per_second": 3.254, + "step": 3000 + }, + { + "entropy": 1.1849471926689148, + "epoch": 0.8855545748749633, + "grad_norm": 0.45680174231529236, + "learning_rate": 0.00016891716192403365, + "loss": 1.18974027633667, + "mean_token_accuracy": 0.735182011127472, + "num_tokens": 12155441.0, + "step": 3010 + }, + { + "entropy": 1.218670165538788, + "epoch": 0.8884966166519565, + "grad_norm": 0.4255404770374298, + "learning_rate": 0.0001686818053589452, + "loss": 1.2337156295776368, + "mean_token_accuracy": 0.7265771627426147, + "num_tokens": 12196036.0, + "step": 3020 + }, + { + "entropy": 1.2351657152175903, + "epoch": 0.8914386584289496, + "grad_norm": 0.49140819907188416, + "learning_rate": 0.0001684457264192556, + "loss": 1.2371573448181152, + "mean_token_accuracy": 0.7276750206947327, + "num_tokens": 12236509.0, + "step": 3030 + }, + { + "entropy": 1.1386435210704804, + "epoch": 0.894380700205943, + "grad_norm": 0.4721025824546814, + "learning_rate": 0.00016820892758797218, + "loss": 1.1611692428588867, + "mean_token_accuracy": 0.7427328288555145, + "num_tokens": 12276991.0, + "step": 3040 + }, + { + "entropy": 1.2254295825958252, + "epoch": 0.8973227419829362, + "grad_norm": 0.5237665772438049, + "learning_rate": 0.000167971411355674, + "loss": 1.2355279922485352, + "mean_token_accuracy": 0.7242857456207276, + "num_tokens": 12317371.0, + "step": 3050 + }, + { + "entropy": 1.2108076691627503, + "epoch": 0.9002647837599294, + "grad_norm": 0.46400943398475647, + "learning_rate": 0.00016773318022048536, + "loss": 1.210099983215332, + "mean_token_accuracy": 0.7319884955883026, + "num_tokens": 12357779.0, + "step": 3060 + }, + { + "entropy": 1.1315430402755737, + "epoch": 0.9032068255369227, + "grad_norm": 0.46516865491867065, + "learning_rate": 0.00016749423668804988, + "loss": 1.160158634185791, + "mean_token_accuracy": 0.7439006865024567, + "num_tokens": 12398230.0, + "step": 3070 + }, + { + "entropy": 1.2317523241043091, + "epoch": 0.9061488673139159, + "grad_norm": 0.45237967371940613, + "learning_rate": 0.00016725458327150383, + "loss": 1.228554630279541, + "mean_token_accuracy": 0.728976035118103, + "num_tokens": 12438762.0, + "step": 3080 + }, + { + "entropy": 1.1924173712730408, + "epoch": 0.9090909090909091, + "grad_norm": 0.4868202805519104, + "learning_rate": 0.00016701422249144985, + "loss": 1.2131217956542968, + "mean_token_accuracy": 0.7263190448284149, + "num_tokens": 12479195.0, + "step": 3090 + }, + { + "entropy": 1.159307700395584, + "epoch": 0.9120329508679024, + "grad_norm": 0.44851839542388916, + "learning_rate": 0.00016677315687593048, + "loss": 1.1793177604675293, + "mean_token_accuracy": 0.7422453939914704, + "num_tokens": 12518938.0, + "step": 3100 + }, + { + "epoch": 0.9120329508679024, + "eval_entropy": 1.193232403965447, + "eval_loss": 1.2098932266235352, + "eval_mean_token_accuracy": 0.7319249039872737, + "eval_num_tokens": 12518938.0, + "eval_runtime": 117.0019, + "eval_samples_per_second": 26.025, + "eval_steps_per_second": 3.256, + "step": 3100 + }, + { + "entropy": 1.2515848875045776, + "epoch": 0.9149749926448956, + "grad_norm": 0.46064046025276184, + "learning_rate": 0.00016653138896040144, + "loss": 1.24728364944458, + "mean_token_accuracy": 0.7212517559528351, + "num_tokens": 12559307.0, + "step": 3110 + }, + { + "entropy": 1.2065839052200318, + "epoch": 0.9179170344218888, + "grad_norm": 0.4273196756839752, + "learning_rate": 0.00016628892128770506, + "loss": 1.2376407623291015, + "mean_token_accuracy": 0.7283597230911255, + "num_tokens": 12599752.0, + "step": 3120 + }, + { + "entropy": 1.245788073539734, + "epoch": 0.9208590761988821, + "grad_norm": 0.4527672529220581, + "learning_rate": 0.0001660457564080435, + "loss": 1.2551823616027833, + "mean_token_accuracy": 0.7254024922847748, + "num_tokens": 12640368.0, + "step": 3130 + }, + { + "entropy": 1.2258686184883119, + "epoch": 0.9238011179758753, + "grad_norm": 0.4471310079097748, + "learning_rate": 0.00016580189687895192, + "loss": 1.2315049171447754, + "mean_token_accuracy": 0.7266620457172394, + "num_tokens": 12681011.0, + "step": 3140 + }, + { + "entropy": 1.2006858110427856, + "epoch": 0.9267431597528685, + "grad_norm": 0.5084729790687561, + "learning_rate": 0.00016555734526527163, + "loss": 1.20996150970459, + "mean_token_accuracy": 0.733843994140625, + "num_tokens": 12721291.0, + "step": 3150 + }, + { + "entropy": 1.240491509437561, + "epoch": 0.9296852015298617, + "grad_norm": 0.5322201251983643, + "learning_rate": 0.000165312104139123, + "loss": 1.2419602394104003, + "mean_token_accuracy": 0.7259994149208069, + "num_tokens": 12761832.0, + "step": 3160 + }, + { + "entropy": 1.2303740501403808, + "epoch": 0.932627243306855, + "grad_norm": 0.48525163531303406, + "learning_rate": 0.00016506617607987863, + "loss": 1.266739845275879, + "mean_token_accuracy": 0.7198700964450836, + "num_tokens": 12802050.0, + "step": 3170 + }, + { + "entropy": 1.2406162559986114, + "epoch": 0.9355692850838482, + "grad_norm": 0.5300395488739014, + "learning_rate": 0.0001648195636741359, + "loss": 1.236351776123047, + "mean_token_accuracy": 0.7296032607555389, + "num_tokens": 12842323.0, + "step": 3180 + }, + { + "entropy": 1.1737658739089967, + "epoch": 0.9385113268608414, + "grad_norm": 0.580947756767273, + "learning_rate": 0.0001645722695156901, + "loss": 1.1983850479125977, + "mean_token_accuracy": 0.7319401502609253, + "num_tokens": 12882948.0, + "step": 3190 + }, + { + "entropy": 1.2107102632522584, + "epoch": 0.9414533686378347, + "grad_norm": 0.4780050814151764, + "learning_rate": 0.00016432429620550688, + "loss": 1.1875343322753906, + "mean_token_accuracy": 0.7371236264705658, + "num_tokens": 12923316.0, + "step": 3200 + }, + { + "epoch": 0.9414533686378347, + "eval_entropy": 1.192403563051399, + "eval_loss": 1.204405426979065, + "eval_mean_token_accuracy": 0.7331006780384094, + "eval_num_tokens": 12923316.0, + "eval_runtime": 117.0112, + "eval_samples_per_second": 26.023, + "eval_steps_per_second": 3.256, + "step": 3200 + }, + { + "entropy": 1.1371994316577911, + "epoch": 0.9443954104148279, + "grad_norm": 0.5067015290260315, + "learning_rate": 0.00016407564635169503, + "loss": 1.186887264251709, + "mean_token_accuracy": 0.7382100522518158, + "num_tokens": 12963817.0, + "step": 3210 + }, + { + "entropy": 1.217560636997223, + "epoch": 0.9473374521918211, + "grad_norm": 0.4357713460922241, + "learning_rate": 0.00016382632256947908, + "loss": 1.2167092323303224, + "mean_token_accuracy": 0.7259755432605743, + "num_tokens": 13004277.0, + "step": 3220 + }, + { + "entropy": 1.2078778982162475, + "epoch": 0.9502794939688144, + "grad_norm": 0.45559161901474, + "learning_rate": 0.0001635763274811716, + "loss": 1.2228084564208985, + "mean_token_accuracy": 0.7331404030323029, + "num_tokens": 13044683.0, + "step": 3230 + }, + { + "entropy": 1.210153341293335, + "epoch": 0.9532215357458076, + "grad_norm": 0.48570674657821655, + "learning_rate": 0.00016332566371614595, + "loss": 1.2366246223449706, + "mean_token_accuracy": 0.7288424909114838, + "num_tokens": 13085217.0, + "step": 3240 + }, + { + "entropy": 1.200468325614929, + "epoch": 0.9561635775228008, + "grad_norm": 0.43572553992271423, + "learning_rate": 0.0001630743339108083, + "loss": 1.1855230331420898, + "mean_token_accuracy": 0.7426514804363251, + "num_tokens": 13125762.0, + "step": 3250 + }, + { + "entropy": 1.192023515701294, + "epoch": 0.9591056192997941, + "grad_norm": 0.6283465623855591, + "learning_rate": 0.00016282234070857, + "loss": 1.2304601669311523, + "mean_token_accuracy": 0.7263703107833862, + "num_tokens": 13166141.0, + "step": 3260 + }, + { + "entropy": 1.3121253371238708, + "epoch": 0.9620476610767873, + "grad_norm": 0.4841027855873108, + "learning_rate": 0.0001625696867598199, + "loss": 1.3120348930358887, + "mean_token_accuracy": 0.7042850077152252, + "num_tokens": 13206360.0, + "step": 3270 + }, + { + "entropy": 1.2325164914131164, + "epoch": 0.9649897028537805, + "grad_norm": 0.5523087978363037, + "learning_rate": 0.0001623163747218964, + "loss": 1.2477660179138184, + "mean_token_accuracy": 0.7236345887184144, + "num_tokens": 13246935.0, + "step": 3280 + }, + { + "entropy": 1.2571437239646912, + "epoch": 0.9679317446307738, + "grad_norm": 0.4331943690776825, + "learning_rate": 0.00016206240725905938, + "loss": 1.263328742980957, + "mean_token_accuracy": 0.7202905654907227, + "num_tokens": 13287422.0, + "step": 3290 + }, + { + "entropy": 1.132285052537918, + "epoch": 0.970873786407767, + "grad_norm": 0.4219360053539276, + "learning_rate": 0.00016180778704246238, + "loss": 1.1526874542236327, + "mean_token_accuracy": 0.745033609867096, + "num_tokens": 13327893.0, + "step": 3300 + }, + { + "epoch": 0.970873786407767, + "eval_entropy": 1.2065895927233958, + "eval_loss": 1.2001736164093018, + "eval_mean_token_accuracy": 0.7332249573209467, + "eval_num_tokens": 13327893.0, + "eval_runtime": 117.023, + "eval_samples_per_second": 26.021, + "eval_steps_per_second": 3.256, + "step": 3300 + }, + { + "entropy": 1.2010907173156737, + "epoch": 0.9738158281847602, + "grad_norm": 0.45471927523612976, + "learning_rate": 0.00016155251675012433, + "loss": 1.2022515296936036, + "mean_token_accuracy": 0.7346913456916809, + "num_tokens": 13368297.0, + "step": 3310 + }, + { + "entropy": 1.235162889957428, + "epoch": 0.9767578699617535, + "grad_norm": 0.4965246319770813, + "learning_rate": 0.0001612965990669015, + "loss": 1.2688727378845215, + "mean_token_accuracy": 0.7194641828536987, + "num_tokens": 13408602.0, + "step": 3320 + }, + { + "entropy": 1.3277331352233888, + "epoch": 0.9796999117387467, + "grad_norm": 0.5565961599349976, + "learning_rate": 0.00016104003668445925, + "loss": 1.3193525314331054, + "mean_token_accuracy": 0.7008399486541748, + "num_tokens": 13448838.0, + "step": 3330 + }, + { + "entropy": 1.1913153290748597, + "epoch": 0.9826419535157399, + "grad_norm": 0.49679502844810486, + "learning_rate": 0.00016078283230124365, + "loss": 1.2235237121582032, + "mean_token_accuracy": 0.7243378221988678, + "num_tokens": 13489366.0, + "step": 3340 + }, + { + "entropy": 1.2956088483333588, + "epoch": 0.9855839952927331, + "grad_norm": 0.49954238533973694, + "learning_rate": 0.00016052498862245313, + "loss": 1.2841950416564942, + "mean_token_accuracy": 0.7132591784000397, + "num_tokens": 13529414.0, + "step": 3350 + }, + { + "entropy": 1.182485854625702, + "epoch": 0.9885260370697264, + "grad_norm": 0.4281366169452667, + "learning_rate": 0.00016026650836001012, + "loss": 1.2035736083984374, + "mean_token_accuracy": 0.7331153571605682, + "num_tokens": 13569904.0, + "step": 3360 + }, + { + "entropy": 1.2611793637275697, + "epoch": 0.9914680788467196, + "grad_norm": 0.4736562669277191, + "learning_rate": 0.0001600073942325323, + "loss": 1.2855722427368164, + "mean_token_accuracy": 0.7184321641921997, + "num_tokens": 13610365.0, + "step": 3370 + }, + { + "entropy": 1.2419211864471436, + "epoch": 0.9944101206237128, + "grad_norm": 0.45400166511535645, + "learning_rate": 0.00015974764896530433, + "loss": 1.2359369277954102, + "mean_token_accuracy": 0.7274314403533936, + "num_tokens": 13650738.0, + "step": 3380 + }, + { + "entropy": 1.16963592171669, + "epoch": 0.9973521624007061, + "grad_norm": 0.5148010849952698, + "learning_rate": 0.0001594872752902489, + "loss": 1.1966312408447266, + "mean_token_accuracy": 0.7361967086791992, + "num_tokens": 13691179.0, + "step": 3390 + }, + { + "entropy": 1.263899064064026, + "epoch": 1.0002942041776994, + "grad_norm": 0.47132888436317444, + "learning_rate": 0.0001592262759458981, + "loss": 1.2513002395629882, + "mean_token_accuracy": 0.7206339836120605, + "num_tokens": 13729142.0, + "step": 3400 + }, + { + "epoch": 1.0002942041776994, + "eval_entropy": 1.1728849974204236, + "eval_loss": 1.196743369102478, + "eval_mean_token_accuracy": 0.7349204848131795, + "eval_num_tokens": 13729142.0, + "eval_runtime": 116.9564, + "eval_samples_per_second": 26.035, + "eval_steps_per_second": 3.258, + "step": 3400 + }, + { + "entropy": 1.0056753396987914, + "epoch": 1.0032362459546926, + "grad_norm": 0.48898911476135254, + "learning_rate": 0.00015896465367736467, + "loss": 0.9896906852722168, + "mean_token_accuracy": 0.7708241283893585, + "num_tokens": 13769748.0, + "step": 3410 + }, + { + "entropy": 0.9479109048843384, + "epoch": 1.0061782877316858, + "grad_norm": 0.5644809603691101, + "learning_rate": 0.00015870241123631303, + "loss": 0.969275951385498, + "mean_token_accuracy": 0.7748298406600952, + "num_tokens": 13810433.0, + "step": 3420 + }, + { + "entropy": 1.0169321179389954, + "epoch": 1.009120329508679, + "grad_norm": 0.5554332733154297, + "learning_rate": 0.00015843955138093043, + "loss": 1.0067197799682617, + "mean_token_accuracy": 0.7651751101016998, + "num_tokens": 13850895.0, + "step": 3430 + }, + { + "entropy": 0.9419119358062744, + "epoch": 1.0120623712856722, + "grad_norm": 0.5447728037834167, + "learning_rate": 0.00015817607687589787, + "loss": 0.9617524147033691, + "mean_token_accuracy": 0.771773511171341, + "num_tokens": 13890924.0, + "step": 3440 + }, + { + "entropy": 0.9768797099590302, + "epoch": 1.0150044130626654, + "grad_norm": 0.5598154664039612, + "learning_rate": 0.00015791199049236106, + "loss": 0.984192180633545, + "mean_token_accuracy": 0.7716309785842895, + "num_tokens": 13931279.0, + "step": 3450 + }, + { + "entropy": 0.9213016629219055, + "epoch": 1.0179464548396586, + "grad_norm": 0.5594082474708557, + "learning_rate": 0.00015764729500790132, + "loss": 0.925694465637207, + "mean_token_accuracy": 0.7817917823791504, + "num_tokens": 13971714.0, + "step": 3460 + }, + { + "entropy": 0.9738560080528259, + "epoch": 1.020888496616652, + "grad_norm": 0.576998233795166, + "learning_rate": 0.00015738199320650622, + "loss": 0.9819319725036622, + "mean_token_accuracy": 0.7748230636119843, + "num_tokens": 14012038.0, + "step": 3470 + }, + { + "entropy": 0.9529994606971741, + "epoch": 1.0238305383936452, + "grad_norm": 0.507428765296936, + "learning_rate": 0.00015711608787854041, + "loss": 0.968116569519043, + "mean_token_accuracy": 0.7762803137302399, + "num_tokens": 14052608.0, + "step": 3480 + }, + { + "entropy": 0.9343097984790802, + "epoch": 1.0267725801706384, + "grad_norm": 0.48517364263534546, + "learning_rate": 0.0001568495818207163, + "loss": 0.9297596931457519, + "mean_token_accuracy": 0.783079195022583, + "num_tokens": 14093119.0, + "step": 3490 + }, + { + "entropy": 0.9860086500644684, + "epoch": 1.0297146219476316, + "grad_norm": 0.4717691242694855, + "learning_rate": 0.00015658247783606455, + "loss": 1.004935073852539, + "mean_token_accuracy": 0.765727037191391, + "num_tokens": 14133569.0, + "step": 3500 + }, + { + "epoch": 1.0297146219476316, + "eval_entropy": 1.0715967291609196, + "eval_loss": 1.2108769416809082, + "eval_mean_token_accuracy": 0.7342690184047529, + "eval_num_tokens": 14133569.0, + "eval_runtime": 116.9449, + "eval_samples_per_second": 26.038, + "eval_steps_per_second": 3.258, + "step": 3500 + }, + { + "entropy": 0.9845475435256958, + "epoch": 1.0326566637246248, + "grad_norm": 0.6196256875991821, + "learning_rate": 0.00015631477873390463, + "loss": 0.9737442970275879, + "mean_token_accuracy": 0.7720641791820526, + "num_tokens": 14174112.0, + "step": 3510 + }, + { + "entropy": 0.9606890559196473, + "epoch": 1.035598705501618, + "grad_norm": 0.5352163910865784, + "learning_rate": 0.00015604648732981535, + "loss": 0.9936755180358887, + "mean_token_accuracy": 0.767872554063797, + "num_tokens": 14214732.0, + "step": 3520 + }, + { + "entropy": 0.9683861494064331, + "epoch": 1.0385407472786115, + "grad_norm": 0.5286763906478882, + "learning_rate": 0.00015577760644560506, + "loss": 0.9670245170593261, + "mean_token_accuracy": 0.7790086328983307, + "num_tokens": 14255029.0, + "step": 3530 + }, + { + "entropy": 0.9281998634338379, + "epoch": 1.0414827890556047, + "grad_norm": 0.5544895529747009, + "learning_rate": 0.0001555081389092822, + "loss": 0.9275782585144043, + "mean_token_accuracy": 0.7800623893737793, + "num_tokens": 14294859.0, + "step": 3540 + }, + { + "entropy": 0.9516816318035126, + "epoch": 1.0444248308325979, + "grad_norm": 0.5528038144111633, + "learning_rate": 0.0001552380875550253, + "loss": 0.971556282043457, + "mean_token_accuracy": 0.7744876265525817, + "num_tokens": 14335364.0, + "step": 3550 + }, + { + "entropy": 1.0375867664813996, + "epoch": 1.047366872609591, + "grad_norm": 0.5946918725967407, + "learning_rate": 0.00015496745522315352, + "loss": 1.036135482788086, + "mean_token_accuracy": 0.7563816487789154, + "num_tokens": 14375785.0, + "step": 3560 + }, + { + "entropy": 0.9602185845375061, + "epoch": 1.0503089143865842, + "grad_norm": 0.55278080701828, + "learning_rate": 0.00015469624476009637, + "loss": 0.9790426254272461, + "mean_token_accuracy": 0.7711789608001709, + "num_tokens": 14416298.0, + "step": 3570 + }, + { + "entropy": 0.9802630722522736, + "epoch": 1.0532509561635774, + "grad_norm": 0.5366263389587402, + "learning_rate": 0.00015442445901836407, + "loss": 0.9828317642211915, + "mean_token_accuracy": 0.7732825756072998, + "num_tokens": 14456758.0, + "step": 3580 + }, + { + "entropy": 0.9827463984489441, + "epoch": 1.0561929979405709, + "grad_norm": 0.5232464075088501, + "learning_rate": 0.0001541521008565174, + "loss": 0.9965373039245605, + "mean_token_accuracy": 0.7664778053760528, + "num_tokens": 14497079.0, + "step": 3590 + }, + { + "entropy": 0.9974855959415436, + "epoch": 1.059135039717564, + "grad_norm": 0.5640583634376526, + "learning_rate": 0.0001538791731391377, + "loss": 1.001423168182373, + "mean_token_accuracy": 0.7696728587150574, + "num_tokens": 14537442.0, + "step": 3600 + }, + { + "epoch": 1.059135039717564, + "eval_entropy": 1.059611619926813, + "eval_loss": 1.2121630907058716, + "eval_mean_token_accuracy": 0.7346383677379978, + "eval_num_tokens": 14537442.0, + "eval_runtime": 116.9964, + "eval_samples_per_second": 26.026, + "eval_steps_per_second": 3.257, + "step": 3600 + }, + { + "entropy": 1.019939649105072, + "epoch": 1.0620770814945573, + "grad_norm": 0.6510360836982727, + "learning_rate": 0.00015360567873679682, + "loss": 1.0335427284240724, + "mean_token_accuracy": 0.7557238221168519, + "num_tokens": 14577903.0, + "step": 3610 + }, + { + "entropy": 0.9929381251335144, + "epoch": 1.0650191232715505, + "grad_norm": 0.4985535740852356, + "learning_rate": 0.00015333162052602663, + "loss": 0.9860858917236328, + "mean_token_accuracy": 0.7693518400192261, + "num_tokens": 14618362.0, + "step": 3620 + }, + { + "entropy": 1.0021346986293793, + "epoch": 1.0679611650485437, + "grad_norm": 0.5036218762397766, + "learning_rate": 0.00015305700138928914, + "loss": 1.0195841789245605, + "mean_token_accuracy": 0.7599446833133697, + "num_tokens": 14658524.0, + "step": 3630 + }, + { + "entropy": 0.9681568443775177, + "epoch": 1.0709032068255369, + "grad_norm": 0.685461699962616, + "learning_rate": 0.00015278182421494597, + "loss": 0.9650713920593261, + "mean_token_accuracy": 0.7742046892642975, + "num_tokens": 14699023.0, + "step": 3640 + }, + { + "entropy": 0.9974366188049316, + "epoch": 1.07384524860253, + "grad_norm": 0.5122579336166382, + "learning_rate": 0.0001525060918972279, + "loss": 1.010261631011963, + "mean_token_accuracy": 0.7668596982955933, + "num_tokens": 14739433.0, + "step": 3650 + }, + { + "entropy": 0.9449751615524292, + "epoch": 1.0767872903795235, + "grad_norm": 0.5020188689231873, + "learning_rate": 0.00015222980733620473, + "loss": 0.9527727127075195, + "mean_token_accuracy": 0.7792690694332123, + "num_tokens": 14779729.0, + "step": 3660 + }, + { + "entropy": 0.9212399244308471, + "epoch": 1.0797293321565167, + "grad_norm": 0.4754965901374817, + "learning_rate": 0.0001519529734377545, + "loss": 0.931304931640625, + "mean_token_accuracy": 0.780491977930069, + "num_tokens": 14819988.0, + "step": 3670 + }, + { + "entropy": 0.9701619386672974, + "epoch": 1.0826713739335099, + "grad_norm": 0.51582270860672, + "learning_rate": 0.0001516755931135329, + "loss": 0.9845802307128906, + "mean_token_accuracy": 0.7685158431529999, + "num_tokens": 14860489.0, + "step": 3680 + }, + { + "entropy": 0.9930408954620361, + "epoch": 1.085613415710503, + "grad_norm": 0.5440912246704102, + "learning_rate": 0.00015139766928094303, + "loss": 0.9996217727661133, + "mean_token_accuracy": 0.7675297498703003, + "num_tokens": 14901041.0, + "step": 3690 + }, + { + "entropy": 0.9450788199901581, + "epoch": 1.0885554574874963, + "grad_norm": 0.5000481605529785, + "learning_rate": 0.00015111920486310417, + "loss": 0.9332949638366699, + "mean_token_accuracy": 0.7813515663146973, + "num_tokens": 14941532.0, + "step": 3700 + }, + { + "epoch": 1.0885554574874963, + "eval_entropy": 1.0368518566522074, + "eval_loss": 1.209765076637268, + "eval_mean_token_accuracy": 0.7356539055744181, + "eval_num_tokens": 14941532.0, + "eval_runtime": 117.1062, + "eval_samples_per_second": 26.002, + "eval_steps_per_second": 3.253, + "step": 3700 + }, + { + "entropy": 0.9497840762138366, + "epoch": 1.0914974992644895, + "grad_norm": 0.5802638530731201, + "learning_rate": 0.00015084020278882153, + "loss": 0.9746996879577636, + "mean_token_accuracy": 0.7703352689743042, + "num_tokens": 14981818.0, + "step": 3710 + }, + { + "entropy": 0.9469231545925141, + "epoch": 1.0944395410414829, + "grad_norm": 0.5750518441200256, + "learning_rate": 0.00015056066599255502, + "loss": 0.9662343025207519, + "mean_token_accuracy": 0.7726805984973908, + "num_tokens": 15022307.0, + "step": 3720 + }, + { + "entropy": 1.0127877593040466, + "epoch": 1.097381582818476, + "grad_norm": 0.5340924859046936, + "learning_rate": 0.0001502805974143888, + "loss": 0.9845216751098633, + "mean_token_accuracy": 0.7723982453346252, + "num_tokens": 15062810.0, + "step": 3730 + }, + { + "entropy": 0.974898761510849, + "epoch": 1.1003236245954693, + "grad_norm": 0.5145857334136963, + "learning_rate": 0.00015000000000000001, + "loss": 1.0081477165222168, + "mean_token_accuracy": 0.7643806636333466, + "num_tokens": 15103409.0, + "step": 3740 + }, + { + "entropy": 0.9510980069637298, + "epoch": 1.1032656663724625, + "grad_norm": 0.5267722606658936, + "learning_rate": 0.00014971887670062802, + "loss": 0.9365800857543946, + "mean_token_accuracy": 0.780946570634842, + "num_tokens": 15143947.0, + "step": 3750 + }, + { + "entropy": 1.0326339960098267, + "epoch": 1.1062077081494557, + "grad_norm": 0.517667293548584, + "learning_rate": 0.0001494372304730432, + "loss": 1.0606879234313964, + "mean_token_accuracy": 0.7520448863506317, + "num_tokens": 15184466.0, + "step": 3760 + }, + { + "entropy": 0.9095244884490967, + "epoch": 1.1091497499264489, + "grad_norm": 0.5059592127799988, + "learning_rate": 0.00014915506427951605, + "loss": 0.9141671180725097, + "mean_token_accuracy": 0.7883690059185028, + "num_tokens": 15224876.0, + "step": 3770 + }, + { + "entropy": 0.922091954946518, + "epoch": 1.1120917917034423, + "grad_norm": 0.5040479898452759, + "learning_rate": 0.0001488723810877858, + "loss": 0.9369124412536621, + "mean_token_accuracy": 0.7827723801136017, + "num_tokens": 15265334.0, + "step": 3780 + }, + { + "entropy": 0.9985373139381408, + "epoch": 1.1150338334804355, + "grad_norm": 0.593623697757721, + "learning_rate": 0.00014858918387102943, + "loss": 1.003388023376465, + "mean_token_accuracy": 0.7662831544876099, + "num_tokens": 15305821.0, + "step": 3790 + }, + { + "entropy": 1.0036361336708068, + "epoch": 1.1179758752574287, + "grad_norm": 0.46527841687202454, + "learning_rate": 0.00014830547560783013, + "loss": 1.0003900527954102, + "mean_token_accuracy": 0.7689646422863007, + "num_tokens": 15346212.0, + "step": 3800 + }, + { + "epoch": 1.1179758752574287, + "eval_entropy": 1.058281412587704, + "eval_loss": 1.207934856414795, + "eval_mean_token_accuracy": 0.7360553802467706, + "eval_num_tokens": 15346212.0, + "eval_runtime": 117.1795, + "eval_samples_per_second": 25.986, + "eval_steps_per_second": 3.251, + "step": 3800 + }, + { + "entropy": 1.0539171755313874, + "epoch": 1.120917917034422, + "grad_norm": 0.5680721402168274, + "learning_rate": 0.00014802125928214626, + "loss": 1.0743833541870118, + "mean_token_accuracy": 0.7493933796882629, + "num_tokens": 15386724.0, + "step": 3810 + }, + { + "entropy": 0.916973739862442, + "epoch": 1.123859958811415, + "grad_norm": 0.5414533019065857, + "learning_rate": 0.0001477365378832797, + "loss": 0.9400577545166016, + "mean_token_accuracy": 0.7826773941516876, + "num_tokens": 15427047.0, + "step": 3820 + }, + { + "entropy": 1.044243198633194, + "epoch": 1.1268020005884083, + "grad_norm": 0.5607292056083679, + "learning_rate": 0.0001474513144058447, + "loss": 1.0428730964660644, + "mean_token_accuracy": 0.7559836447238922, + "num_tokens": 15467253.0, + "step": 3830 + }, + { + "entropy": 0.9271435976028443, + "epoch": 1.1297440423654015, + "grad_norm": 0.5749255418777466, + "learning_rate": 0.0001471655918497361, + "loss": 0.9598716735839844, + "mean_token_accuracy": 0.7791375935077667, + "num_tokens": 15507806.0, + "step": 3840 + }, + { + "entropy": 0.9984059453010559, + "epoch": 1.132686084142395, + "grad_norm": 0.671238362789154, + "learning_rate": 0.00014687937322009793, + "loss": 0.9737402915954589, + "mean_token_accuracy": 0.7700112521648407, + "num_tokens": 15548383.0, + "step": 3850 + }, + { + "entropy": 0.9276021301746369, + "epoch": 1.135628125919388, + "grad_norm": 0.5724707841873169, + "learning_rate": 0.00014659266152729176, + "loss": 0.9540791511535645, + "mean_token_accuracy": 0.7781682848930359, + "num_tokens": 15588893.0, + "step": 3860 + }, + { + "entropy": 0.9864838421344757, + "epoch": 1.1385701676963813, + "grad_norm": 0.5926504731178284, + "learning_rate": 0.0001463054597868651, + "loss": 0.9837197303771973, + "mean_token_accuracy": 0.7700894057750702, + "num_tokens": 15629429.0, + "step": 3870 + }, + { + "entropy": 1.044099175930023, + "epoch": 1.1415122094733745, + "grad_norm": 0.5998210906982422, + "learning_rate": 0.00014601777101951957, + "loss": 1.064276123046875, + "mean_token_accuracy": 0.7496702373027802, + "num_tokens": 15669112.0, + "step": 3880 + }, + { + "entropy": 0.9448180139064789, + "epoch": 1.1444542512503677, + "grad_norm": 0.5697433948516846, + "learning_rate": 0.00014572959825107922, + "loss": 0.939006233215332, + "mean_token_accuracy": 0.7818219542503357, + "num_tokens": 15709418.0, + "step": 3890 + }, + { + "entropy": 0.9624788880348205, + "epoch": 1.147396293027361, + "grad_norm": 0.5536630749702454, + "learning_rate": 0.0001454409445124587, + "loss": 0.9612356185913086, + "mean_token_accuracy": 0.7735930144786834, + "num_tokens": 15749926.0, + "step": 3900 + }, + { + "epoch": 1.147396293027361, + "eval_entropy": 1.002416886056815, + "eval_loss": 1.2163069248199463, + "eval_mean_token_accuracy": 0.736245130147208, + "eval_num_tokens": 15749926.0, + "eval_runtime": 117.0848, + "eval_samples_per_second": 26.007, + "eval_steps_per_second": 3.254, + "step": 3900 + }, + { + "entropy": 0.9970716178417206, + "epoch": 1.150338334804354, + "grad_norm": 0.551414966583252, + "learning_rate": 0.00014515181283963132, + "loss": 1.024658489227295, + "mean_token_accuracy": 0.7571725130081177, + "num_tokens": 15790231.0, + "step": 3910 + }, + { + "entropy": 1.0260444402694702, + "epoch": 1.1532803765813475, + "grad_norm": 0.565596878528595, + "learning_rate": 0.0001448622062735972, + "loss": 1.0220839500427246, + "mean_token_accuracy": 0.7611811697483063, + "num_tokens": 15830791.0, + "step": 3920 + }, + { + "entropy": 0.9745386719703675, + "epoch": 1.1562224183583407, + "grad_norm": 0.522496223449707, + "learning_rate": 0.00014457212786035122, + "loss": 0.9898022651672364, + "mean_token_accuracy": 0.7702635705471039, + "num_tokens": 15871330.0, + "step": 3930 + }, + { + "entropy": 1.00519802570343, + "epoch": 1.159164460135334, + "grad_norm": 0.579716145992279, + "learning_rate": 0.00014428158065085098, + "loss": 1.0124700546264649, + "mean_token_accuracy": 0.7646716058254241, + "num_tokens": 15911828.0, + "step": 3940 + }, + { + "entropy": 0.9504435777664184, + "epoch": 1.1621065019123271, + "grad_norm": 0.5688005089759827, + "learning_rate": 0.00014399056770098478, + "loss": 0.9538630485534668, + "mean_token_accuracy": 0.776702755689621, + "num_tokens": 15952424.0, + "step": 3950 + }, + { + "entropy": 0.972985816001892, + "epoch": 1.1650485436893203, + "grad_norm": 0.5217563509941101, + "learning_rate": 0.00014369909207153947, + "loss": 0.9882010459899903, + "mean_token_accuracy": 0.7707372605800629, + "num_tokens": 15992995.0, + "step": 3960 + }, + { + "entropy": 1.0266568660736084, + "epoch": 1.1679905854663137, + "grad_norm": 0.607991099357605, + "learning_rate": 0.00014340715682816806, + "loss": 1.0269791603088378, + "mean_token_accuracy": 0.7627450406551362, + "num_tokens": 16033357.0, + "step": 3970 + }, + { + "entropy": 1.023914647102356, + "epoch": 1.170932627243307, + "grad_norm": 0.5779170393943787, + "learning_rate": 0.00014311476504135794, + "loss": 1.0473231315612792, + "mean_token_accuracy": 0.7546798884868622, + "num_tokens": 16073799.0, + "step": 3980 + }, + { + "entropy": 0.9414635598659515, + "epoch": 1.1738746690203001, + "grad_norm": 0.5319372415542603, + "learning_rate": 0.00014282191978639799, + "loss": 0.9280409812927246, + "mean_token_accuracy": 0.7800300478935241, + "num_tokens": 16114313.0, + "step": 3990 + }, + { + "entropy": 0.9413889229297638, + "epoch": 1.1768167107972933, + "grad_norm": 0.6013742089271545, + "learning_rate": 0.00014252862414334665, + "loss": 0.9760286331176757, + "mean_token_accuracy": 0.7707565903663636, + "num_tokens": 16154854.0, + "step": 4000 + }, + { + "epoch": 1.1768167107972933, + "eval_entropy": 1.0498265117954395, + "eval_loss": 1.2053319215774536, + "eval_mean_token_accuracy": 0.7371757869019596, + "eval_num_tokens": 16154854.0, + "eval_runtime": 116.9953, + "eval_samples_per_second": 26.027, + "eval_steps_per_second": 3.257, + "step": 4000 + }, + { + "entropy": 1.01264528632164, + "epoch": 1.1797587525742865, + "grad_norm": 0.5052332282066345, + "learning_rate": 0.00014223488119699944, + "loss": 0.9941823959350586, + "mean_token_accuracy": 0.7712079107761383, + "num_tokens": 16195404.0, + "step": 4010 + }, + { + "entropy": 0.9542388617992401, + "epoch": 1.1827007943512797, + "grad_norm": 0.6624171733856201, + "learning_rate": 0.00014194069403685643, + "loss": 0.9699134826660156, + "mean_token_accuracy": 0.7701153516769409, + "num_tokens": 16235933.0, + "step": 4020 + }, + { + "entropy": 0.9835664987564087, + "epoch": 1.185642836128273, + "grad_norm": 0.6638393998146057, + "learning_rate": 0.00014164606575708984, + "loss": 1.004053497314453, + "mean_token_accuracy": 0.7658522069454193, + "num_tokens": 16276402.0, + "step": 4030 + }, + { + "entropy": 0.9395282685756683, + "epoch": 1.1885848779052663, + "grad_norm": 0.5497994422912598, + "learning_rate": 0.0001413509994565114, + "loss": 0.9349452972412109, + "mean_token_accuracy": 0.7839109897613525, + "num_tokens": 16316751.0, + "step": 4040 + }, + { + "entropy": 0.9585356175899505, + "epoch": 1.1915269196822595, + "grad_norm": 0.5739843845367432, + "learning_rate": 0.00014105549823853987, + "loss": 0.9759317398071289, + "mean_token_accuracy": 0.7685169577598572, + "num_tokens": 16357253.0, + "step": 4050 + }, + { + "entropy": 0.9427942156791687, + "epoch": 1.1944689614592527, + "grad_norm": 0.5962517261505127, + "learning_rate": 0.00014075956521116827, + "loss": 0.9541123390197754, + "mean_token_accuracy": 0.7789060473442078, + "num_tokens": 16397470.0, + "step": 4060 + }, + { + "entropy": 0.9317695260047912, + "epoch": 1.197411003236246, + "grad_norm": 0.5742291212081909, + "learning_rate": 0.00014046320348693134, + "loss": 0.9425789833068847, + "mean_token_accuracy": 0.7804486751556396, + "num_tokens": 16438035.0, + "step": 4070 + }, + { + "entropy": 0.9994288563728333, + "epoch": 1.2003530450132391, + "grad_norm": 0.628220796585083, + "learning_rate": 0.00014016641618287264, + "loss": 1.0083752632141114, + "mean_token_accuracy": 0.7678903639316559, + "num_tokens": 16478563.0, + "step": 4080 + }, + { + "entropy": 0.9661390006542205, + "epoch": 1.2032950867902323, + "grad_norm": 0.5717418789863586, + "learning_rate": 0.00013986920642051196, + "loss": 0.956269645690918, + "mean_token_accuracy": 0.7756177723407746, + "num_tokens": 16519059.0, + "step": 4090 + }, + { + "entropy": 0.9637384474277496, + "epoch": 1.2062371285672255, + "grad_norm": 0.5957377552986145, + "learning_rate": 0.00013957157732581227, + "loss": 0.9740910530090332, + "mean_token_accuracy": 0.7705820441246033, + "num_tokens": 16559585.0, + "step": 4100 + }, + { + "epoch": 1.2062371285672255, + "eval_entropy": 1.0279845802646297, + "eval_loss": 1.204230546951294, + "eval_mean_token_accuracy": 0.7378412169108554, + "eval_num_tokens": 16559585.0, + "eval_runtime": 116.9158, + "eval_samples_per_second": 26.044, + "eval_steps_per_second": 3.259, + "step": 4100 + }, + { + "entropy": 0.9299012839794158, + "epoch": 1.209179170344219, + "grad_norm": 0.5047076344490051, + "learning_rate": 0.00013927353202914704, + "loss": 0.9325620651245117, + "mean_token_accuracy": 0.7856215178966522, + "num_tokens": 16599897.0, + "step": 4110 + }, + { + "entropy": 0.9579800009727478, + "epoch": 1.2121212121212122, + "grad_norm": 0.6236160397529602, + "learning_rate": 0.00013897507366526717, + "loss": 0.9825181007385254, + "mean_token_accuracy": 0.771970808506012, + "num_tokens": 16640445.0, + "step": 4120 + }, + { + "entropy": 0.9490958392620087, + "epoch": 1.2150632538982054, + "grad_norm": 0.5913335084915161, + "learning_rate": 0.00013867620537326807, + "loss": 0.9593384742736817, + "mean_token_accuracy": 0.7774133384227753, + "num_tokens": 16681074.0, + "step": 4130 + }, + { + "entropy": 0.9436961114406586, + "epoch": 1.2180052956751986, + "grad_norm": 0.5566667318344116, + "learning_rate": 0.00013837693029655673, + "loss": 0.9432112693786621, + "mean_token_accuracy": 0.7793719172477722, + "num_tokens": 16721515.0, + "step": 4140 + }, + { + "entropy": 0.9555536270141601, + "epoch": 1.2209473374521917, + "grad_norm": 0.5549019575119019, + "learning_rate": 0.00013807725158281845, + "loss": 0.964748764038086, + "mean_token_accuracy": 0.7745252251625061, + "num_tokens": 16761971.0, + "step": 4150 + }, + { + "entropy": 1.0964530289173127, + "epoch": 1.2238893792291852, + "grad_norm": 0.5607911944389343, + "learning_rate": 0.000137777172383984, + "loss": 1.1134785652160644, + "mean_token_accuracy": 0.7422545254230499, + "num_tokens": 16801992.0, + "step": 4160 + }, + { + "entropy": 0.991803640127182, + "epoch": 1.2268314210061784, + "grad_norm": 0.5912790894508362, + "learning_rate": 0.00013747669585619621, + "loss": 0.9909832954406739, + "mean_token_accuracy": 0.7725964307785034, + "num_tokens": 16842361.0, + "step": 4170 + }, + { + "entropy": 0.9360088646411896, + "epoch": 1.2297734627831716, + "grad_norm": 0.5747997760772705, + "learning_rate": 0.00013717582515977703, + "loss": 0.9449604988098145, + "mean_token_accuracy": 0.7815338909626007, + "num_tokens": 16882838.0, + "step": 4180 + }, + { + "entropy": 0.9765557944774628, + "epoch": 1.2327155045601648, + "grad_norm": 0.6022299528121948, + "learning_rate": 0.000136874563459194, + "loss": 0.9870802879333496, + "mean_token_accuracy": 0.7712435305118561, + "num_tokens": 16923293.0, + "step": 4190 + }, + { + "entropy": 0.9462984323501586, + "epoch": 1.235657546337158, + "grad_norm": 0.6731317639350891, + "learning_rate": 0.0001365729139230273, + "loss": 0.9600887298583984, + "mean_token_accuracy": 0.776117742061615, + "num_tokens": 16963785.0, + "step": 4200 + }, + { + "epoch": 1.235657546337158, + "eval_entropy": 1.0305638493984703, + "eval_loss": 1.2021441459655762, + "eval_mean_token_accuracy": 0.7379450076834111, + "eval_num_tokens": 16963785.0, + "eval_runtime": 117.0903, + "eval_samples_per_second": 26.006, + "eval_steps_per_second": 3.254, + "step": 4200 + }, + { + "entropy": 0.8939768195152282, + "epoch": 1.2385995881141512, + "grad_norm": 0.6204389333724976, + "learning_rate": 0.00013627087972393605, + "loss": 0.8930576324462891, + "mean_token_accuracy": 0.7922165811061859, + "num_tokens": 17004178.0, + "step": 4210 + }, + { + "entropy": 0.9507308840751648, + "epoch": 1.2415416298911444, + "grad_norm": 0.6163705587387085, + "learning_rate": 0.00013596846403862535, + "loss": 0.9666755676269532, + "mean_token_accuracy": 0.7786275684833527, + "num_tokens": 17044627.0, + "step": 4220 + }, + { + "entropy": 0.975046980381012, + "epoch": 1.2444836716681378, + "grad_norm": 0.5746111273765564, + "learning_rate": 0.00013566567004781246, + "loss": 0.9781759262084961, + "mean_token_accuracy": 0.7753113329410553, + "num_tokens": 17084732.0, + "step": 4230 + }, + { + "entropy": 1.0526322185993195, + "epoch": 1.247425713445131, + "grad_norm": 0.5716729164123535, + "learning_rate": 0.00013536250093619369, + "loss": 1.072258472442627, + "mean_token_accuracy": 0.7464600443840027, + "num_tokens": 17125248.0, + "step": 4240 + }, + { + "entropy": 1.0479264855384827, + "epoch": 1.2503677552221242, + "grad_norm": 0.493936151266098, + "learning_rate": 0.0001350589598924107, + "loss": 1.0373758316040038, + "mean_token_accuracy": 0.755853122472763, + "num_tokens": 17165782.0, + "step": 4250 + }, + { + "entropy": 1.0287334680557252, + "epoch": 1.2533097969991174, + "grad_norm": 0.5333849191665649, + "learning_rate": 0.000134755050109017, + "loss": 1.04658203125, + "mean_token_accuracy": 0.7597375035285949, + "num_tokens": 17206071.0, + "step": 4260 + }, + { + "entropy": 1.031979387998581, + "epoch": 1.2562518387761106, + "grad_norm": 0.5658190250396729, + "learning_rate": 0.00013445077478244443, + "loss": 1.043376350402832, + "mean_token_accuracy": 0.7583476483821869, + "num_tokens": 17246566.0, + "step": 4270 + }, + { + "entropy": 0.9648872256278992, + "epoch": 1.2591938805531038, + "grad_norm": 0.5289658308029175, + "learning_rate": 0.00013414613711296952, + "loss": 0.9618735313415527, + "mean_token_accuracy": 0.7800238966941834, + "num_tokens": 17286973.0, + "step": 4280 + }, + { + "entropy": 1.0292047560214996, + "epoch": 1.262135922330097, + "grad_norm": 0.5604351162910461, + "learning_rate": 0.0001338411403046797, + "loss": 1.0540773391723632, + "mean_token_accuracy": 0.7540550827980042, + "num_tokens": 17327524.0, + "step": 4290 + }, + { + "entropy": 0.9784096240997314, + "epoch": 1.2650779641070904, + "grad_norm": 0.4879390597343445, + "learning_rate": 0.0001335357875654399, + "loss": 0.9683723449707031, + "mean_token_accuracy": 0.7743236780166626, + "num_tokens": 17367944.0, + "step": 4300 + }, + { + "epoch": 1.2650779641070904, + "eval_entropy": 1.024107585976443, + "eval_loss": 1.2039283514022827, + "eval_mean_token_accuracy": 0.7379744303195183, + "eval_num_tokens": 17367944.0, + "eval_runtime": 117.345, + "eval_samples_per_second": 25.949, + "eval_steps_per_second": 3.247, + "step": 4300 + }, + { + "entropy": 0.9147277295589447, + "epoch": 1.2680200058840836, + "grad_norm": 0.6553487181663513, + "learning_rate": 0.00013323008210685847, + "loss": 0.9428836822509765, + "mean_token_accuracy": 0.7800225138664245, + "num_tokens": 17408180.0, + "step": 4310 + }, + { + "entropy": 0.9862501919269562, + "epoch": 1.2709620476610768, + "grad_norm": 0.5519722700119019, + "learning_rate": 0.00013292402714425362, + "loss": 0.986777400970459, + "mean_token_accuracy": 0.7709893763065339, + "num_tokens": 17448767.0, + "step": 4320 + }, + { + "entropy": 0.984834861755371, + "epoch": 1.27390408943807, + "grad_norm": 0.6772357821464539, + "learning_rate": 0.0001326176258966195, + "loss": 1.0062361717224122, + "mean_token_accuracy": 0.7673668503761292, + "num_tokens": 17488778.0, + "step": 4330 + }, + { + "entropy": 1.0079510390758515, + "epoch": 1.2768461312150632, + "grad_norm": 0.6486600637435913, + "learning_rate": 0.00013231088158659245, + "loss": 1.0013746261596679, + "mean_token_accuracy": 0.7689732432365417, + "num_tokens": 17529357.0, + "step": 4340 + }, + { + "entropy": 0.930513882637024, + "epoch": 1.2797881729920566, + "grad_norm": 0.5784080028533936, + "learning_rate": 0.000132003797440417, + "loss": 0.941129207611084, + "mean_token_accuracy": 0.7789569735527039, + "num_tokens": 17569912.0, + "step": 4350 + }, + { + "entropy": 0.9446238994598388, + "epoch": 1.2827302147690498, + "grad_norm": 0.5628758072853088, + "learning_rate": 0.00013169637668791192, + "loss": 0.9538597106933594, + "mean_token_accuracy": 0.77459996342659, + "num_tokens": 17609948.0, + "step": 4360 + }, + { + "entropy": 0.9073866546154022, + "epoch": 1.285672256546043, + "grad_norm": 0.530838131904602, + "learning_rate": 0.0001313886225624364, + "loss": 0.9122503280639649, + "mean_token_accuracy": 0.7910377562046051, + "num_tokens": 17650396.0, + "step": 4370 + }, + { + "entropy": 0.9176636934280396, + "epoch": 1.2886142983230362, + "grad_norm": 0.5510920286178589, + "learning_rate": 0.00013108053830085585, + "loss": 0.9334745407104492, + "mean_token_accuracy": 0.7811925649642945, + "num_tokens": 17690414.0, + "step": 4380 + }, + { + "entropy": 0.9487575829029083, + "epoch": 1.2915563401000294, + "grad_norm": 0.5764872431755066, + "learning_rate": 0.00013077212714350807, + "loss": 0.9546429634094238, + "mean_token_accuracy": 0.7754026472568512, + "num_tokens": 17730995.0, + "step": 4390 + }, + { + "entropy": 0.9580233573913575, + "epoch": 1.2944983818770226, + "grad_norm": 0.510247528553009, + "learning_rate": 0.00013046339233416896, + "loss": 0.9728780746459961, + "mean_token_accuracy": 0.7761204540729523, + "num_tokens": 17771461.0, + "step": 4400 + }, + { + "epoch": 1.2944983818770226, + "eval_entropy": 1.0582224565541025, + "eval_loss": 1.1942919492721558, + "eval_mean_token_accuracy": 0.7387891846691842, + "eval_num_tokens": 17771461.0, + "eval_runtime": 116.9989, + "eval_samples_per_second": 26.026, + "eval_steps_per_second": 3.256, + "step": 4400 + }, + { + "entropy": 1.009384435415268, + "epoch": 1.2974404236540158, + "grad_norm": 0.6525319218635559, + "learning_rate": 0.00013015433712001853, + "loss": 1.0117576599121094, + "mean_token_accuracy": 0.7641228914260865, + "num_tokens": 17811743.0, + "step": 4410 + }, + { + "entropy": 1.0025621831417084, + "epoch": 1.3003824654310092, + "grad_norm": 0.573704719543457, + "learning_rate": 0.00012984496475160667, + "loss": 1.0045761108398437, + "mean_token_accuracy": 0.7632306814193726, + "num_tokens": 17852150.0, + "step": 4420 + }, + { + "entropy": 0.889021772146225, + "epoch": 1.3033245072080024, + "grad_norm": 0.5899218916893005, + "learning_rate": 0.00012953527848281907, + "loss": 0.9004623413085937, + "mean_token_accuracy": 0.7905885875225067, + "num_tokens": 17892636.0, + "step": 4430 + }, + { + "entropy": 0.9145092189311981, + "epoch": 1.3062665489849956, + "grad_norm": 0.5533415675163269, + "learning_rate": 0.00012922528157084288, + "loss": 0.9265445709228516, + "mean_token_accuracy": 0.7852813005447388, + "num_tokens": 17933064.0, + "step": 4440 + }, + { + "entropy": 0.9506058990955353, + "epoch": 1.3092085907619888, + "grad_norm": 0.6637270450592041, + "learning_rate": 0.00012891497727613254, + "loss": 0.9665675163269043, + "mean_token_accuracy": 0.7746671974658966, + "num_tokens": 17973649.0, + "step": 4450 + }, + { + "entropy": 0.9762279152870178, + "epoch": 1.312150632538982, + "grad_norm": 0.5837135314941406, + "learning_rate": 0.0001286043688623754, + "loss": 0.9735669136047364, + "mean_token_accuracy": 0.7707210063934327, + "num_tokens": 18013745.0, + "step": 4460 + }, + { + "entropy": 0.9729628086090087, + "epoch": 1.3150926743159752, + "grad_norm": 0.5597095489501953, + "learning_rate": 0.00012829345959645744, + "loss": 0.983339500427246, + "mean_token_accuracy": 0.7697983980178833, + "num_tokens": 18054069.0, + "step": 4470 + }, + { + "entropy": 0.9756720840930939, + "epoch": 1.3180347160929684, + "grad_norm": 0.5869277119636536, + "learning_rate": 0.00012798225274842902, + "loss": 0.9763286590576172, + "mean_token_accuracy": 0.772778332233429, + "num_tokens": 18094600.0, + "step": 4480 + }, + { + "entropy": 0.9129612624645234, + "epoch": 1.3209767578699618, + "grad_norm": 0.5384027361869812, + "learning_rate": 0.00012767075159147022, + "loss": 0.9275237083435058, + "mean_token_accuracy": 0.7822975754737854, + "num_tokens": 18135046.0, + "step": 4490 + }, + { + "entropy": 1.018626469373703, + "epoch": 1.323918799646955, + "grad_norm": 0.6402458548545837, + "learning_rate": 0.0001273589594018567, + "loss": 1.0211774826049804, + "mean_token_accuracy": 0.7581977605819702, + "num_tokens": 18175538.0, + "step": 4500 + }, + { + "epoch": 1.323918799646955, + "eval_entropy": 1.0286849455570612, + "eval_loss": 1.1943385601043701, + "eval_mean_token_accuracy": 0.7395750786375812, + "eval_num_tokens": 18175538.0, + "eval_runtime": 117.1237, + "eval_samples_per_second": 25.998, + "eval_steps_per_second": 3.253, + "step": 4500 + }, + { + "entropy": 0.9917169988155365, + "epoch": 1.3268608414239482, + "grad_norm": 0.5651717185974121, + "learning_rate": 0.00012704687945892505, + "loss": 0.9929667472839355, + "mean_token_accuracy": 0.7704780220985412, + "num_tokens": 18215849.0, + "step": 4510 + }, + { + "entropy": 1.033211100101471, + "epoch": 1.3298028832009414, + "grad_norm": 0.6076104640960693, + "learning_rate": 0.00012673451504503842, + "loss": 1.053134059906006, + "mean_token_accuracy": 0.7530263483524322, + "num_tokens": 18256385.0, + "step": 4520 + }, + { + "entropy": 0.9446504592895508, + "epoch": 1.3327449249779346, + "grad_norm": 0.6524848341941833, + "learning_rate": 0.00012642186944555186, + "loss": 0.9612871170043945, + "mean_token_accuracy": 0.7765836179256439, + "num_tokens": 18296852.0, + "step": 4530 + }, + { + "entropy": 0.95798819065094, + "epoch": 1.335686966754928, + "grad_norm": 0.5617368817329407, + "learning_rate": 0.00012610894594877788, + "loss": 0.945002555847168, + "mean_token_accuracy": 0.7801140964031219, + "num_tokens": 18337142.0, + "step": 4540 + }, + { + "entropy": 0.9328159093856812, + "epoch": 1.338629008531921, + "grad_norm": 0.5558400750160217, + "learning_rate": 0.00012579574784595188, + "loss": 0.9508322715759278, + "mean_token_accuracy": 0.7804535567760468, + "num_tokens": 18377435.0, + "step": 4550 + }, + { + "entropy": 0.9236328303813934, + "epoch": 1.3415710503089144, + "grad_norm": 0.711991548538208, + "learning_rate": 0.00012548227843119743, + "loss": 0.9464892387390137, + "mean_token_accuracy": 0.781309175491333, + "num_tokens": 18418008.0, + "step": 4560 + }, + { + "entropy": 0.9790382087230682, + "epoch": 1.3445130920859076, + "grad_norm": 0.6019552946090698, + "learning_rate": 0.00012516854100149164, + "loss": 0.9772819519042969, + "mean_token_accuracy": 0.7732869625091553, + "num_tokens": 18458454.0, + "step": 4570 + }, + { + "entropy": 0.9923386096954345, + "epoch": 1.3474551338629008, + "grad_norm": 0.5897583365440369, + "learning_rate": 0.00012485453885663063, + "loss": 0.9893753051757812, + "mean_token_accuracy": 0.7679760038852692, + "num_tokens": 18498893.0, + "step": 4580 + }, + { + "entropy": 0.975125765800476, + "epoch": 1.350397175639894, + "grad_norm": 0.5963767766952515, + "learning_rate": 0.00012454027529919462, + "loss": 0.9868227958679199, + "mean_token_accuracy": 0.7681001186370849, + "num_tokens": 18539290.0, + "step": 4590 + }, + { + "entropy": 0.9119367241859436, + "epoch": 1.3533392174168872, + "grad_norm": 0.6379720568656921, + "learning_rate": 0.00012422575363451335, + "loss": 0.9250626564025879, + "mean_token_accuracy": 0.7873954355716706, + "num_tokens": 18579755.0, + "step": 4600 + }, + { + "epoch": 1.3533392174168872, + "eval_entropy": 1.045042129717474, + "eval_loss": 1.1902120113372803, + "eval_mean_token_accuracy": 0.739525359915936, + "eval_num_tokens": 18579755.0, + "eval_runtime": 117.0159, + "eval_samples_per_second": 26.022, + "eval_steps_per_second": 3.256, + "step": 4600 + }, + { + "entropy": 0.9590096414089203, + "epoch": 1.3562812591938807, + "grad_norm": 0.6441698670387268, + "learning_rate": 0.00012391097717063117, + "loss": 0.9705069541931153, + "mean_token_accuracy": 0.772186666727066, + "num_tokens": 18620196.0, + "step": 4610 + }, + { + "entropy": 0.9852827608585357, + "epoch": 1.3592233009708738, + "grad_norm": 0.6008490920066833, + "learning_rate": 0.00012359594921827245, + "loss": 0.9953920364379882, + "mean_token_accuracy": 0.7687745451927185, + "num_tokens": 18660133.0, + "step": 4620 + }, + { + "entropy": 1.0110926747322082, + "epoch": 1.362165342747867, + "grad_norm": 0.5604993104934692, + "learning_rate": 0.00012328067309080653, + "loss": 1.0147642135620116, + "mean_token_accuracy": 0.764192932844162, + "num_tokens": 18700534.0, + "step": 4630 + }, + { + "entropy": 0.9773748695850373, + "epoch": 1.3651073845248602, + "grad_norm": 0.5655143857002258, + "learning_rate": 0.0001229651521042131, + "loss": 1.0055460929870605, + "mean_token_accuracy": 0.7645916283130646, + "num_tokens": 18741011.0, + "step": 4640 + }, + { + "entropy": 0.912693589925766, + "epoch": 1.3680494263018534, + "grad_norm": 0.5726847648620605, + "learning_rate": 0.00012264938957704707, + "loss": 0.9029450416564941, + "mean_token_accuracy": 0.7917129874229432, + "num_tokens": 18781532.0, + "step": 4650 + }, + { + "entropy": 0.9796160280704498, + "epoch": 1.3709914680788466, + "grad_norm": 0.5168410539627075, + "learning_rate": 0.00012233338883040385, + "loss": 0.994998550415039, + "mean_token_accuracy": 0.7707984328269959, + "num_tokens": 18821962.0, + "step": 4660 + }, + { + "entropy": 0.9919624567031861, + "epoch": 1.3739335098558398, + "grad_norm": 0.5887457728385925, + "learning_rate": 0.00012201715318788445, + "loss": 0.9917054176330566, + "mean_token_accuracy": 0.7673246085643768, + "num_tokens": 18862330.0, + "step": 4670 + }, + { + "entropy": 1.0523385763168336, + "epoch": 1.3768755516328333, + "grad_norm": 0.5428098440170288, + "learning_rate": 0.00012170068597556035, + "loss": 1.077983283996582, + "mean_token_accuracy": 0.753084135055542, + "num_tokens": 18902801.0, + "step": 4680 + }, + { + "entropy": 0.9840884447097779, + "epoch": 1.3798175934098265, + "grad_norm": 0.6384422183036804, + "learning_rate": 0.00012138399052193867, + "loss": 0.9722138404846191, + "mean_token_accuracy": 0.7726257860660553, + "num_tokens": 18943381.0, + "step": 4690 + }, + { + "entropy": 0.9885792315006257, + "epoch": 1.3827596351868197, + "grad_norm": 0.5873745679855347, + "learning_rate": 0.00012106707015792702, + "loss": 1.0171488761901855, + "mean_token_accuracy": 0.7622905492782592, + "num_tokens": 18983723.0, + "step": 4700 + }, + { + "epoch": 1.3827596351868197, + "eval_entropy": 1.0410449004235856, + "eval_loss": 1.1852467060089111, + "eval_mean_token_accuracy": 0.7411133794021105, + "eval_num_tokens": 18983723.0, + "eval_runtime": 117.0373, + "eval_samples_per_second": 26.017, + "eval_steps_per_second": 3.255, + "step": 4700 + }, + { + "entropy": 0.9628095984458923, + "epoch": 1.3857016769638129, + "grad_norm": 0.6936764121055603, + "learning_rate": 0.00012074992821679866, + "loss": 0.9454580307006836, + "mean_token_accuracy": 0.7766146242618561, + "num_tokens": 19024209.0, + "step": 4710 + }, + { + "entropy": 0.9361280083656311, + "epoch": 1.388643718740806, + "grad_norm": 0.6228808164596558, + "learning_rate": 0.00012043256803415723, + "loss": 0.9670154571533203, + "mean_token_accuracy": 0.7755923092365264, + "num_tokens": 19064478.0, + "step": 4720 + }, + { + "entropy": 0.9650130271911621, + "epoch": 1.3915857605177995, + "grad_norm": 0.6201843619346619, + "learning_rate": 0.00012011499294790188, + "loss": 0.9677058219909668, + "mean_token_accuracy": 0.7731058478355408, + "num_tokens": 19104877.0, + "step": 4730 + }, + { + "entropy": 0.9925507187843323, + "epoch": 1.3945278022947925, + "grad_norm": 0.6081472635269165, + "learning_rate": 0.00011979720629819195, + "loss": 0.9994287490844727, + "mean_token_accuracy": 0.7684627115726471, + "num_tokens": 19145110.0, + "step": 4740 + }, + { + "entropy": 0.9522078454494476, + "epoch": 1.3974698440717859, + "grad_norm": 0.5446240305900574, + "learning_rate": 0.00011947921142741197, + "loss": 0.9563077926635742, + "mean_token_accuracy": 0.7776003420352936, + "num_tokens": 19185735.0, + "step": 4750 + }, + { + "entropy": 0.9502395629882813, + "epoch": 1.400411885848779, + "grad_norm": 0.6443742513656616, + "learning_rate": 0.00011916101168013649, + "loss": 0.9714900016784668, + "mean_token_accuracy": 0.7751095175743103, + "num_tokens": 19226033.0, + "step": 4760 + }, + { + "entropy": 0.9543413400650025, + "epoch": 1.4033539276257723, + "grad_norm": 0.60639488697052, + "learning_rate": 0.0001188426104030949, + "loss": 0.9422737121582031, + "mean_token_accuracy": 0.77926025390625, + "num_tokens": 19266072.0, + "step": 4770 + }, + { + "entropy": 0.8988826811313629, + "epoch": 1.4062959694027655, + "grad_norm": 0.6495632529258728, + "learning_rate": 0.00011852401094513621, + "loss": 0.9340031623840332, + "mean_token_accuracy": 0.7852738976478577, + "num_tokens": 19306236.0, + "step": 4780 + }, + { + "entropy": 1.0078811585903167, + "epoch": 1.4092380111797587, + "grad_norm": 0.6039602160453796, + "learning_rate": 0.00011820521665719377, + "loss": 1.0128409385681152, + "mean_token_accuracy": 0.7650022029876709, + "num_tokens": 19346514.0, + "step": 4790 + }, + { + "entropy": 1.002649539709091, + "epoch": 1.412180052956752, + "grad_norm": 0.5959292054176331, + "learning_rate": 0.00011788623089225024, + "loss": 0.9958960533142089, + "mean_token_accuracy": 0.7670526385307312, + "num_tokens": 19387036.0, + "step": 4800 + }, + { + "epoch": 1.412180052956752, + "eval_entropy": 1.0362290414612436, + "eval_loss": 1.1831785440444946, + "eval_mean_token_accuracy": 0.741490173058247, + "eval_num_tokens": 19387036.0, + "eval_runtime": 116.8979, + "eval_samples_per_second": 26.048, + "eval_steps_per_second": 3.259, + "step": 4800 + }, + { + "entropy": 0.9328351199626923, + "epoch": 1.4151220947337453, + "grad_norm": 0.5852963924407959, + "learning_rate": 0.00011756705700530206, + "loss": 0.9353754043579101, + "mean_token_accuracy": 0.7827379524707794, + "num_tokens": 19427397.0, + "step": 4810 + }, + { + "entropy": 0.9303468823432922, + "epoch": 1.4180641365107385, + "grad_norm": 0.5715077519416809, + "learning_rate": 0.0001172476983533243, + "loss": 0.9525899887084961, + "mean_token_accuracy": 0.7789464175701142, + "num_tokens": 19467357.0, + "step": 4820 + }, + { + "entropy": 0.9718841493129731, + "epoch": 1.4210061782877317, + "grad_norm": 0.5573017597198486, + "learning_rate": 0.00011692815829523536, + "loss": 0.974174690246582, + "mean_token_accuracy": 0.7711581230163574, + "num_tokens": 19507831.0, + "step": 4830 + }, + { + "entropy": 1.0064192593097687, + "epoch": 1.4239482200647249, + "grad_norm": 0.5526002645492554, + "learning_rate": 0.00011660844019186159, + "loss": 1.024794101715088, + "mean_token_accuracy": 0.7663941740989685, + "num_tokens": 19548408.0, + "step": 4840 + }, + { + "entropy": 1.0319006383419036, + "epoch": 1.426890261841718, + "grad_norm": 0.5624271035194397, + "learning_rate": 0.000116288547405902, + "loss": 1.0340099334716797, + "mean_token_accuracy": 0.760799127817154, + "num_tokens": 19588730.0, + "step": 4850 + }, + { + "entropy": 0.9651375532150268, + "epoch": 1.4298323036187113, + "grad_norm": 0.5835928320884705, + "learning_rate": 0.00011596848330189282, + "loss": 0.9745287895202637, + "mean_token_accuracy": 0.770064502954483, + "num_tokens": 19629242.0, + "step": 4860 + }, + { + "entropy": 0.9604414582252503, + "epoch": 1.4327743453957047, + "grad_norm": 0.6139530539512634, + "learning_rate": 0.00011564825124617218, + "loss": 0.967037582397461, + "mean_token_accuracy": 0.7793013870716095, + "num_tokens": 19669879.0, + "step": 4870 + }, + { + "entropy": 0.9346582233905792, + "epoch": 1.435716387172698, + "grad_norm": 0.6224908232688904, + "learning_rate": 0.00011532785460684466, + "loss": 0.9508832931518555, + "mean_token_accuracy": 0.7788917005062104, + "num_tokens": 19710258.0, + "step": 4880 + }, + { + "entropy": 0.9697970807552337, + "epoch": 1.438658428949691, + "grad_norm": 0.6372175216674805, + "learning_rate": 0.00011500729675374589, + "loss": 0.9690608978271484, + "mean_token_accuracy": 0.773440134525299, + "num_tokens": 19750412.0, + "step": 4890 + }, + { + "entropy": 0.9443894863128662, + "epoch": 1.4416004707266843, + "grad_norm": 0.6615795493125916, + "learning_rate": 0.00011468658105840706, + "loss": 0.9526325225830078, + "mean_token_accuracy": 0.7742028653621673, + "num_tokens": 19790931.0, + "step": 4900 + }, + { + "epoch": 1.4416004707266843, + "eval_entropy": 1.0257837637828717, + "eval_loss": 1.1847437620162964, + "eval_mean_token_accuracy": 0.7422285342779685, + "eval_num_tokens": 19790931.0, + "eval_runtime": 117.0067, + "eval_samples_per_second": 26.024, + "eval_steps_per_second": 3.256, + "step": 4900 + }, + { + "entropy": 0.9378713190555572, + "epoch": 1.4445425125036775, + "grad_norm": 0.5865938663482666, + "learning_rate": 0.0001143657108940196, + "loss": 0.9491632461547852, + "mean_token_accuracy": 0.7826192319393158, + "num_tokens": 19830998.0, + "step": 4910 + }, + { + "entropy": 0.8981156468391418, + "epoch": 1.447484554280671, + "grad_norm": 0.5529859066009521, + "learning_rate": 0.00011404468963539945, + "loss": 0.9059307098388671, + "mean_token_accuracy": 0.7881741523742676, + "num_tokens": 19871496.0, + "step": 4920 + }, + { + "entropy": 0.9764446496963501, + "epoch": 1.4504265960576639, + "grad_norm": 0.5437819361686707, + "learning_rate": 0.00011372352065895185, + "loss": 0.9850223541259766, + "mean_token_accuracy": 0.7667160212993622, + "num_tokens": 19912139.0, + "step": 4930 + }, + { + "entropy": 1.009915566444397, + "epoch": 1.4533686378346573, + "grad_norm": 0.48672956228256226, + "learning_rate": 0.00011340220734263562, + "loss": 1.009783935546875, + "mean_token_accuracy": 0.765262508392334, + "num_tokens": 19952597.0, + "step": 4940 + }, + { + "entropy": 0.9249416530132294, + "epoch": 1.4563106796116505, + "grad_norm": 0.5625399351119995, + "learning_rate": 0.00011308075306592771, + "loss": 0.9417881011962891, + "mean_token_accuracy": 0.7811116933822632, + "num_tokens": 19992805.0, + "step": 4950 + }, + { + "entropy": 0.9421619713306427, + "epoch": 1.4592527213886437, + "grad_norm": 0.5547005534172058, + "learning_rate": 0.00011275916120978769, + "loss": 0.9452463150024414, + "mean_token_accuracy": 0.7800073266029358, + "num_tokens": 20033035.0, + "step": 4960 + }, + { + "entropy": 0.9370833516120911, + "epoch": 1.462194763165637, + "grad_norm": 0.6140998601913452, + "learning_rate": 0.00011243743515662209, + "loss": 0.9688581466674805, + "mean_token_accuracy": 0.7777487993240356, + "num_tokens": 20073484.0, + "step": 4970 + }, + { + "entropy": 1.0171671450138091, + "epoch": 1.46513680494263, + "grad_norm": 0.6281487345695496, + "learning_rate": 0.00011211557829024892, + "loss": 0.9987648963928223, + "mean_token_accuracy": 0.7690559566020966, + "num_tokens": 20113884.0, + "step": 4980 + }, + { + "entropy": 0.9113238871097564, + "epoch": 1.4680788467196235, + "grad_norm": 0.5871062278747559, + "learning_rate": 0.00011179359399586202, + "loss": 0.9162681579589844, + "mean_token_accuracy": 0.7859906852245331, + "num_tokens": 20154035.0, + "step": 4990 + }, + { + "entropy": 0.9293498694896698, + "epoch": 1.4710208884966167, + "grad_norm": 0.5837708115577698, + "learning_rate": 0.00011147148565999553, + "loss": 0.9455188751220703, + "mean_token_accuracy": 0.7774161994457245, + "num_tokens": 20194398.0, + "step": 5000 + }, + { + "epoch": 1.4710208884966167, + "eval_entropy": 1.0215178381583196, + "eval_loss": 1.182082176208496, + "eval_mean_token_accuracy": 0.7424087036313034, + "eval_num_tokens": 20194398.0, + "eval_runtime": 116.9583, + "eval_samples_per_second": 26.035, + "eval_steps_per_second": 3.258, + "step": 5000 + }, + { + "entropy": 0.8972196578979492, + "epoch": 1.47396293027361, + "grad_norm": 0.6664556264877319, + "learning_rate": 0.00011114925667048814, + "loss": 0.897000789642334, + "mean_token_accuracy": 0.7910800576210022, + "num_tokens": 20234842.0, + "step": 5010 + }, + { + "entropy": 0.9366280138492584, + "epoch": 1.4769049720506031, + "grad_norm": 0.5105628967285156, + "learning_rate": 0.00011082691041644762, + "loss": 0.9634222984313965, + "mean_token_accuracy": 0.7767743766307831, + "num_tokens": 20275050.0, + "step": 5020 + }, + { + "entropy": 1.0160198926925659, + "epoch": 1.4798470138275963, + "grad_norm": 0.6369785666465759, + "learning_rate": 0.00011050445028821504, + "loss": 1.0192986488342286, + "mean_token_accuracy": 0.763582181930542, + "num_tokens": 20315449.0, + "step": 5030 + }, + { + "entropy": 1.007281619310379, + "epoch": 1.4827890556045895, + "grad_norm": 0.6209942698478699, + "learning_rate": 0.00011018187967732918, + "loss": 0.9973898887634277, + "mean_token_accuracy": 0.7695520281791687, + "num_tokens": 20355896.0, + "step": 5040 + }, + { + "entropy": 0.9635446310043335, + "epoch": 1.4857310973815827, + "grad_norm": 0.6402096748352051, + "learning_rate": 0.00010985920197649086, + "loss": 1.0030365943908692, + "mean_token_accuracy": 0.7655979931354523, + "num_tokens": 20396451.0, + "step": 5050 + }, + { + "entropy": 0.9760343492031097, + "epoch": 1.4886731391585761, + "grad_norm": 0.658566415309906, + "learning_rate": 0.00010953642057952722, + "loss": 0.9697424888610839, + "mean_token_accuracy": 0.7760293245315552, + "num_tokens": 20436615.0, + "step": 5060 + }, + { + "entropy": 0.9795664429664612, + "epoch": 1.4916151809355693, + "grad_norm": 0.5999899506568909, + "learning_rate": 0.00010921353888135605, + "loss": 0.9814806938171386, + "mean_token_accuracy": 0.7703676760196686, + "num_tokens": 20477070.0, + "step": 5070 + }, + { + "entropy": 0.9684806585311889, + "epoch": 1.4945572227125625, + "grad_norm": 0.6002531051635742, + "learning_rate": 0.00010889056027795009, + "loss": 0.9861810684204102, + "mean_token_accuracy": 0.7723352909088135, + "num_tokens": 20517284.0, + "step": 5080 + }, + { + "entropy": 0.9738320171833038, + "epoch": 1.4974992644895557, + "grad_norm": 0.6089113354682922, + "learning_rate": 0.00010856748816630127, + "loss": 0.984062385559082, + "mean_token_accuracy": 0.772194218635559, + "num_tokens": 20557266.0, + "step": 5090 + }, + { + "entropy": 0.9736992299556733, + "epoch": 1.500441306266549, + "grad_norm": 0.5777798295021057, + "learning_rate": 0.00010824432594438505, + "loss": 0.9862478256225586, + "mean_token_accuracy": 0.7714533090591431, + "num_tokens": 20597675.0, + "step": 5100 + }, + { + "epoch": 1.500441306266549, + "eval_entropy": 1.0292494011519775, + "eval_loss": 1.1772292852401733, + "eval_mean_token_accuracy": 0.7431638131304363, + "eval_num_tokens": 20597675.0, + "eval_runtime": 116.9139, + "eval_samples_per_second": 26.045, + "eval_steps_per_second": 3.259, + "step": 5100 + }, + { + "entropy": 0.902402263879776, + "epoch": 1.5033833480435423, + "grad_norm": 0.6047548055648804, + "learning_rate": 0.0001079210770111246, + "loss": 0.9013402938842774, + "mean_token_accuracy": 0.7890763878822327, + "num_tokens": 20638028.0, + "step": 5110 + }, + { + "entropy": 0.9433676958084106, + "epoch": 1.5063253898205353, + "grad_norm": 0.7120152711868286, + "learning_rate": 0.00010759774476635513, + "loss": 0.9629843711853028, + "mean_token_accuracy": 0.7767750382423401, + "num_tokens": 20677777.0, + "step": 5120 + }, + { + "entropy": 1.028594321012497, + "epoch": 1.5092674315975287, + "grad_norm": 0.5008658766746521, + "learning_rate": 0.00010727433261078808, + "loss": 1.0339035987854004, + "mean_token_accuracy": 0.757422798871994, + "num_tokens": 20718022.0, + "step": 5130 + }, + { + "entropy": 1.0165768265724182, + "epoch": 1.512209473374522, + "grad_norm": 0.5533917546272278, + "learning_rate": 0.00010695084394597537, + "loss": 1.0265631675720215, + "mean_token_accuracy": 0.7609834551811219, + "num_tokens": 20758730.0, + "step": 5140 + }, + { + "entropy": 1.011590701341629, + "epoch": 1.5151515151515151, + "grad_norm": 0.6278111934661865, + "learning_rate": 0.00010662728217427362, + "loss": 1.0116978645324708, + "mean_token_accuracy": 0.7609262108802796, + "num_tokens": 20799202.0, + "step": 5150 + }, + { + "entropy": 0.9122114419937134, + "epoch": 1.5180935569285083, + "grad_norm": 0.5556227564811707, + "learning_rate": 0.00010630365069880837, + "loss": 0.9219463348388672, + "mean_token_accuracy": 0.7841397285461426, + "num_tokens": 20839777.0, + "step": 5160 + }, + { + "entropy": 0.9289700865745545, + "epoch": 1.5210355987055015, + "grad_norm": 0.656017005443573, + "learning_rate": 0.00010597995292343827, + "loss": 0.9393370628356934, + "mean_token_accuracy": 0.7785055756568908, + "num_tokens": 20880324.0, + "step": 5170 + }, + { + "entropy": 0.9671324849128723, + "epoch": 1.523977640482495, + "grad_norm": 0.5946719646453857, + "learning_rate": 0.00010565619225271934, + "loss": 0.9726341247558594, + "mean_token_accuracy": 0.772115957736969, + "num_tokens": 20920903.0, + "step": 5180 + }, + { + "entropy": 0.9615132927894592, + "epoch": 1.526919682259488, + "grad_norm": 0.5949153304100037, + "learning_rate": 0.00010533237209186904, + "loss": 0.9622700691223145, + "mean_token_accuracy": 0.7711909949779511, + "num_tokens": 20960934.0, + "step": 5190 + }, + { + "entropy": 0.9015084564685821, + "epoch": 1.5298617240364814, + "grad_norm": 0.594940721988678, + "learning_rate": 0.00010500849584673059, + "loss": 0.9115975379943848, + "mean_token_accuracy": 0.7880061626434326, + "num_tokens": 21001387.0, + "step": 5200 + }, + { + "epoch": 1.5298617240364814, + "eval_entropy": 1.0002246155669996, + "eval_loss": 1.1774756908416748, + "eval_mean_token_accuracy": 0.7437332308511408, + "eval_num_tokens": 21001387.0, + "eval_runtime": 116.917, + "eval_samples_per_second": 26.044, + "eval_steps_per_second": 3.259, + "step": 5200 + }, + { + "entropy": 0.9508689403533935, + "epoch": 1.5328037658134746, + "grad_norm": 0.5927796959877014, + "learning_rate": 0.00010468456692373703, + "loss": 0.9860681533813477, + "mean_token_accuracy": 0.7731941938400269, + "num_tokens": 21041244.0, + "step": 5210 + }, + { + "entropy": 0.9849535644054412, + "epoch": 1.5357458075904677, + "grad_norm": 0.6247168183326721, + "learning_rate": 0.0001043605887298755, + "loss": 0.9714067459106446, + "mean_token_accuracy": 0.7714132785797119, + "num_tokens": 21081657.0, + "step": 5220 + }, + { + "entropy": 0.9713167667388916, + "epoch": 1.5386878493674612, + "grad_norm": 0.677769124507904, + "learning_rate": 0.00010403656467265138, + "loss": 0.9748648643493653, + "mean_token_accuracy": 0.7746425211429596, + "num_tokens": 21122109.0, + "step": 5230 + }, + { + "entropy": 0.9380859136581421, + "epoch": 1.5416298911444541, + "grad_norm": 0.6017981767654419, + "learning_rate": 0.00010371249816005235, + "loss": 0.950676441192627, + "mean_token_accuracy": 0.7762335240840912, + "num_tokens": 21162652.0, + "step": 5240 + }, + { + "entropy": 0.9800868451595306, + "epoch": 1.5445719329214476, + "grad_norm": 0.6067262291908264, + "learning_rate": 0.00010338839260051265, + "loss": 0.9775652885437012, + "mean_token_accuracy": 0.7736145675182342, + "num_tokens": 21203018.0, + "step": 5250 + }, + { + "entropy": 0.9683707654476166, + "epoch": 1.5475139746984408, + "grad_norm": 0.5600547790527344, + "learning_rate": 0.00010306425140287724, + "loss": 0.9947422027587891, + "mean_token_accuracy": 0.768592232465744, + "num_tokens": 21243193.0, + "step": 5260 + }, + { + "entropy": 0.9973979473114014, + "epoch": 1.550456016475434, + "grad_norm": 0.6263974905014038, + "learning_rate": 0.00010274007797636589, + "loss": 1.0072562217712402, + "mean_token_accuracy": 0.7686869263648987, + "num_tokens": 21283229.0, + "step": 5270 + }, + { + "entropy": 0.9464682042598724, + "epoch": 1.5533980582524272, + "grad_norm": 0.6058053970336914, + "learning_rate": 0.00010241587573053732, + "loss": 0.9497817993164063, + "mean_token_accuracy": 0.7785175144672394, + "num_tokens": 21323783.0, + "step": 5280 + }, + { + "entropy": 1.0010394990444182, + "epoch": 1.5563401000294204, + "grad_norm": 0.6609871983528137, + "learning_rate": 0.0001020916480752534, + "loss": 1.0204454421997071, + "mean_token_accuracy": 0.761556738615036, + "num_tokens": 21364201.0, + "step": 5290 + }, + { + "entropy": 0.9770765423774719, + "epoch": 1.5592821418064138, + "grad_norm": 0.5533031225204468, + "learning_rate": 0.00010176739842064323, + "loss": 0.9723864555358886, + "mean_token_accuracy": 0.7763941168785096, + "num_tokens": 21404698.0, + "step": 5300 + }, + { + "epoch": 1.5592821418064138, + "eval_entropy": 1.0454554280896826, + "eval_loss": 1.168828010559082, + "eval_mean_token_accuracy": 0.744419585844976, + "eval_num_tokens": 21404698.0, + "eval_runtime": 116.9992, + "eval_samples_per_second": 26.026, + "eval_steps_per_second": 3.256, + "step": 5300 + }, + { + "entropy": 0.9656676173210144, + "epoch": 1.5622241835834068, + "grad_norm": 0.6528343558311462, + "learning_rate": 0.00010144313017706726, + "loss": 0.9678720474243164, + "mean_token_accuracy": 0.779698771238327, + "num_tokens": 21445187.0, + "step": 5310 + }, + { + "entropy": 0.9781722486019134, + "epoch": 1.5651662253604002, + "grad_norm": 0.5709927678108215, + "learning_rate": 0.00010111884675508151, + "loss": 1.010976505279541, + "mean_token_accuracy": 0.7641195952892303, + "num_tokens": 21485680.0, + "step": 5320 + }, + { + "entropy": 0.9678059935569763, + "epoch": 1.5681082671373934, + "grad_norm": 0.5990722179412842, + "learning_rate": 0.00010079455156540163, + "loss": 0.9739880561828613, + "mean_token_accuracy": 0.777262145280838, + "num_tokens": 21526255.0, + "step": 5330 + }, + { + "entropy": 0.934472793340683, + "epoch": 1.5710503089143866, + "grad_norm": 0.5266041159629822, + "learning_rate": 0.00010047024801886702, + "loss": 0.9320767402648926, + "mean_token_accuracy": 0.7839205145835877, + "num_tokens": 21566600.0, + "step": 5340 + }, + { + "entropy": 0.9387928783893585, + "epoch": 1.5739923506913798, + "grad_norm": 0.5499687790870667, + "learning_rate": 0.00010014593952640494, + "loss": 0.9497169494628906, + "mean_token_accuracy": 0.7800655484199523, + "num_tokens": 21607156.0, + "step": 5350 + }, + { + "entropy": 1.0074778258800507, + "epoch": 1.576934392468373, + "grad_norm": 0.6578675508499146, + "learning_rate": 9.982162949899479e-05, + "loss": 1.020614242553711, + "mean_token_accuracy": 0.765311861038208, + "num_tokens": 21647444.0, + "step": 5360 + }, + { + "entropy": 0.9886070728302002, + "epoch": 1.5798764342453664, + "grad_norm": 0.6587820053100586, + "learning_rate": 9.949732134763199e-05, + "loss": 0.993044662475586, + "mean_token_accuracy": 0.7681563913822174, + "num_tokens": 21688001.0, + "step": 5370 + }, + { + "entropy": 0.899987381696701, + "epoch": 1.5828184760223594, + "grad_norm": 0.5484562516212463, + "learning_rate": 9.917301848329231e-05, + "loss": 0.9030593872070313, + "mean_token_accuracy": 0.7870559990406036, + "num_tokens": 21727882.0, + "step": 5380 + }, + { + "entropy": 0.9620799243450164, + "epoch": 1.5857605177993528, + "grad_norm": 0.619149386882782, + "learning_rate": 9.884872431689581e-05, + "loss": 0.9912420272827148, + "mean_token_accuracy": 0.7723720014095307, + "num_tokens": 21767753.0, + "step": 5390 + }, + { + "entropy": 1.000234466791153, + "epoch": 1.588702559576346, + "grad_norm": 0.5575194954872131, + "learning_rate": 9.852444225927122e-05, + "loss": 0.9978320121765136, + "mean_token_accuracy": 0.7687974095344543, + "num_tokens": 21808266.0, + "step": 5400 + }, + { + "epoch": 1.588702559576346, + "eval_entropy": 1.0398932406752128, + "eval_loss": 1.1691675186157227, + "eval_mean_token_accuracy": 0.7446287646694133, + "eval_num_tokens": 21808266.0, + "eval_runtime": 116.9348, + "eval_samples_per_second": 26.04, + "eval_steps_per_second": 3.258, + "step": 5400 + }, + { + "entropy": 0.9786124050617218, + "epoch": 1.5916446013533392, + "grad_norm": 0.5668993592262268, + "learning_rate": 9.820017572111973e-05, + "loss": 0.9736597061157226, + "mean_token_accuracy": 0.768933230638504, + "num_tokens": 21848699.0, + "step": 5410 + }, + { + "entropy": 0.9540457367897034, + "epoch": 1.5945866431303326, + "grad_norm": 0.5640490651130676, + "learning_rate": 9.787592811297946e-05, + "loss": 0.9902207374572753, + "mean_token_accuracy": 0.7725074052810669, + "num_tokens": 21888992.0, + "step": 5420 + }, + { + "entropy": 1.0212572634220123, + "epoch": 1.5975286849073256, + "grad_norm": 0.5303104519844055, + "learning_rate": 9.755170284518941e-05, + "loss": 1.0194875717163085, + "mean_token_accuracy": 0.7608138382434845, + "num_tokens": 21929626.0, + "step": 5430 + }, + { + "entropy": 0.9701859831809998, + "epoch": 1.600470726684319, + "grad_norm": 0.5962478518486023, + "learning_rate": 9.722750332785349e-05, + "loss": 0.9606605529785156, + "mean_token_accuracy": 0.7774016797542572, + "num_tokens": 21970234.0, + "step": 5440 + }, + { + "entropy": 0.9712904334068299, + "epoch": 1.6034127684613122, + "grad_norm": 0.6927991509437561, + "learning_rate": 9.690333297080493e-05, + "loss": 0.9966094970703125, + "mean_token_accuracy": 0.7730933606624604, + "num_tokens": 22010748.0, + "step": 5450 + }, + { + "entropy": 0.9969470083713532, + "epoch": 1.6063548102383054, + "grad_norm": 0.48069873452186584, + "learning_rate": 9.657919518357008e-05, + "loss": 1.0035072326660157, + "mean_token_accuracy": 0.7701967000961304, + "num_tokens": 22051287.0, + "step": 5460 + }, + { + "entropy": 0.9618437588214874, + "epoch": 1.6092968520152986, + "grad_norm": 0.5836319923400879, + "learning_rate": 9.625509337533296e-05, + "loss": 0.9715272903442382, + "mean_token_accuracy": 0.7741812229156494, + "num_tokens": 22091789.0, + "step": 5470 + }, + { + "entropy": 0.92412930727005, + "epoch": 1.6122388937922918, + "grad_norm": 0.5578837990760803, + "learning_rate": 9.593103095489895e-05, + "loss": 0.9371700286865234, + "mean_token_accuracy": 0.780053973197937, + "num_tokens": 22132202.0, + "step": 5480 + }, + { + "entropy": 0.9944902658462524, + "epoch": 1.6151809355692852, + "grad_norm": 0.6106790900230408, + "learning_rate": 9.560701133065932e-05, + "loss": 0.9925059318542481, + "mean_token_accuracy": 0.771945059299469, + "num_tokens": 22172268.0, + "step": 5490 + }, + { + "entropy": 0.9864640951156616, + "epoch": 1.6181229773462782, + "grad_norm": 0.6734248399734497, + "learning_rate": 9.528303791055511e-05, + "loss": 1.001873779296875, + "mean_token_accuracy": 0.7679209113121033, + "num_tokens": 22212768.0, + "step": 5500 + }, + { + "epoch": 1.6181229773462782, + "eval_entropy": 1.017620304985622, + "eval_loss": 1.1692047119140625, + "eval_mean_token_accuracy": 0.7452891367314056, + "eval_num_tokens": 22212768.0, + "eval_runtime": 116.9536, + "eval_samples_per_second": 26.036, + "eval_steps_per_second": 3.258, + "step": 5500 + } + ], + "logging_steps": 10, + "max_steps": 10197, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.0319598971846e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}