diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4914 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.9991386735572783, + "eval_steps": 100, + "global_step": 3045, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.004921865386981666, + "grad_norm": 10.908417701721191, + "learning_rate": 3.278688524590164e-07, + "loss": 2.6851, + "mean_token_accuracy": 0.490550322830677, + "step": 5 + }, + { + "epoch": 0.009843730773963333, + "grad_norm": 10.821477890014648, + "learning_rate": 6.557377049180328e-07, + "loss": 2.6916, + "mean_token_accuracy": 0.4892874449491501, + "step": 10 + }, + { + "epoch": 0.014765596160944998, + "grad_norm": 9.100831031799316, + "learning_rate": 9.836065573770493e-07, + "loss": 2.6563, + "mean_token_accuracy": 0.49268135130405427, + "step": 15 + }, + { + "epoch": 0.019687461547926666, + "grad_norm": 6.744043827056885, + "learning_rate": 1.3114754098360657e-06, + "loss": 2.4838, + "mean_token_accuracy": 0.503991749882698, + "step": 20 + }, + { + "epoch": 0.02460932693490833, + "grad_norm": 4.111428737640381, + "learning_rate": 1.6393442622950819e-06, + "loss": 2.3481, + "mean_token_accuracy": 0.5121142826974392, + "step": 25 + }, + { + "epoch": 0.029531192321889995, + "grad_norm": 3.504826068878174, + "learning_rate": 1.9672131147540985e-06, + "loss": 2.1834, + "mean_token_accuracy": 0.525759468972683, + "step": 30 + }, + { + "epoch": 0.034453057708871665, + "grad_norm": 2.371668577194214, + "learning_rate": 2.295081967213115e-06, + "loss": 1.9992, + "mean_token_accuracy": 0.5471328645944595, + "step": 35 + }, + { + "epoch": 0.03937492309585333, + "grad_norm": 1.910736083984375, + "learning_rate": 2.6229508196721314e-06, + "loss": 1.8619, + "mean_token_accuracy": 0.5657269343733787, + "step": 40 + }, + { + "epoch": 0.044296788482835, + "grad_norm": 1.6694586277008057, + "learning_rate": 2.9508196721311478e-06, + "loss": 1.7324, + "mean_token_accuracy": 0.582801228761673, + "step": 45 + }, + { + "epoch": 0.04921865386981666, + "grad_norm": 1.3371120691299438, + "learning_rate": 3.2786885245901638e-06, + "loss": 1.5922, + "mean_token_accuracy": 0.6066210582852364, + "step": 50 + }, + { + "epoch": 0.054140519256798324, + "grad_norm": 1.153715968132019, + "learning_rate": 3.6065573770491806e-06, + "loss": 1.4607, + "mean_token_accuracy": 0.629358272254467, + "step": 55 + }, + { + "epoch": 0.05906238464377999, + "grad_norm": 1.011682391166687, + "learning_rate": 3.934426229508197e-06, + "loss": 1.3312, + "mean_token_accuracy": 0.6534152328968048, + "step": 60 + }, + { + "epoch": 0.06398425003076166, + "grad_norm": 0.8580278158187866, + "learning_rate": 4.2622950819672135e-06, + "loss": 1.2163, + "mean_token_accuracy": 0.676006656885147, + "step": 65 + }, + { + "epoch": 0.06890611541774333, + "grad_norm": 0.7737818360328674, + "learning_rate": 4.59016393442623e-06, + "loss": 1.1256, + "mean_token_accuracy": 0.695121419429779, + "step": 70 + }, + { + "epoch": 0.073827980804725, + "grad_norm": 0.6026164889335632, + "learning_rate": 4.918032786885246e-06, + "loss": 1.0456, + "mean_token_accuracy": 0.7120692700147628, + "step": 75 + }, + { + "epoch": 0.07874984619170666, + "grad_norm": 20.797266006469727, + "learning_rate": 5.245901639344263e-06, + "loss": 0.9884, + "mean_token_accuracy": 0.7246918171644211, + "step": 80 + }, + { + "epoch": 0.08367171157868833, + "grad_norm": 24.53761100769043, + "learning_rate": 5.573770491803278e-06, + "loss": 0.9471, + "mean_token_accuracy": 0.7344574183225632, + "step": 85 + }, + { + "epoch": 0.08859357696567, + "grad_norm": 7.69836950302124, + "learning_rate": 5.9016393442622956e-06, + "loss": 0.9291, + "mean_token_accuracy": 0.7384938269853591, + "step": 90 + }, + { + "epoch": 0.09351544235265165, + "grad_norm": 0.42971891164779663, + "learning_rate": 6.229508196721312e-06, + "loss": 0.9071, + "mean_token_accuracy": 0.743149445950985, + "step": 95 + }, + { + "epoch": 0.09843730773963331, + "grad_norm": 0.4011496901512146, + "learning_rate": 6.5573770491803276e-06, + "loss": 0.8839, + "mean_token_accuracy": 0.7489838138222694, + "step": 100 + }, + { + "epoch": 0.10335917312661498, + "grad_norm": 0.4182426631450653, + "learning_rate": 6.885245901639345e-06, + "loss": 0.864, + "mean_token_accuracy": 0.7533508613705635, + "step": 105 + }, + { + "epoch": 0.10828103851359665, + "grad_norm": 0.4418739080429077, + "learning_rate": 7.213114754098361e-06, + "loss": 0.8461, + "mean_token_accuracy": 0.7571793958544731, + "step": 110 + }, + { + "epoch": 0.11320290390057831, + "grad_norm": 4.76384973526001, + "learning_rate": 7.540983606557377e-06, + "loss": 0.8478, + "mean_token_accuracy": 0.7560782924294471, + "step": 115 + }, + { + "epoch": 0.11812476928755998, + "grad_norm": 0.426782488822937, + "learning_rate": 7.868852459016394e-06, + "loss": 0.8262, + "mean_token_accuracy": 0.7621309965848923, + "step": 120 + }, + { + "epoch": 0.12304663467454165, + "grad_norm": 3.5404343605041504, + "learning_rate": 8.19672131147541e-06, + "loss": 0.8239, + "mean_token_accuracy": 0.7624999329447746, + "step": 125 + }, + { + "epoch": 0.12796850006152333, + "grad_norm": 0.6128109097480774, + "learning_rate": 8.524590163934427e-06, + "loss": 0.8125, + "mean_token_accuracy": 0.7650709196925163, + "step": 130 + }, + { + "epoch": 0.132890365448505, + "grad_norm": 0.4441392719745636, + "learning_rate": 8.852459016393443e-06, + "loss": 0.8178, + "mean_token_accuracy": 0.7635303542017937, + "step": 135 + }, + { + "epoch": 0.13781223083548666, + "grad_norm": 0.6959536075592041, + "learning_rate": 9.18032786885246e-06, + "loss": 0.797, + "mean_token_accuracy": 0.7682553365826607, + "step": 140 + }, + { + "epoch": 0.14273409622246833, + "grad_norm": 0.4633159935474396, + "learning_rate": 9.508196721311476e-06, + "loss": 0.7972, + "mean_token_accuracy": 0.7677757993340493, + "step": 145 + }, + { + "epoch": 0.14765596160945, + "grad_norm": 0.3808494806289673, + "learning_rate": 9.836065573770493e-06, + "loss": 0.7956, + "mean_token_accuracy": 0.7682796508073807, + "step": 150 + }, + { + "epoch": 0.15257782699643166, + "grad_norm": 1.2230223417282104, + "learning_rate": 1.0163934426229509e-05, + "loss": 0.7714, + "mean_token_accuracy": 0.7741705477237701, + "step": 155 + }, + { + "epoch": 0.15749969238341333, + "grad_norm": 1.2708261013031006, + "learning_rate": 1.0491803278688525e-05, + "loss": 0.7671, + "mean_token_accuracy": 0.7750522747635842, + "step": 160 + }, + { + "epoch": 0.162421557770395, + "grad_norm": 0.4153311252593994, + "learning_rate": 1.0819672131147544e-05, + "loss": 0.762, + "mean_token_accuracy": 0.776003035902977, + "step": 165 + }, + { + "epoch": 0.16734342315737666, + "grad_norm": 0.48690149188041687, + "learning_rate": 1.1147540983606557e-05, + "loss": 0.7611, + "mean_token_accuracy": 0.776053948700428, + "step": 170 + }, + { + "epoch": 0.17226528854435832, + "grad_norm": 0.3839600682258606, + "learning_rate": 1.1475409836065575e-05, + "loss": 0.7518, + "mean_token_accuracy": 0.7784286484122276, + "step": 175 + }, + { + "epoch": 0.17718715393134, + "grad_norm": 0.33650702238082886, + "learning_rate": 1.1803278688524591e-05, + "loss": 0.7425, + "mean_token_accuracy": 0.7807790979743003, + "step": 180 + }, + { + "epoch": 0.18210901931832166, + "grad_norm": 0.34878674149513245, + "learning_rate": 1.2131147540983608e-05, + "loss": 0.7469, + "mean_token_accuracy": 0.779270826280117, + "step": 185 + }, + { + "epoch": 0.1870308847053033, + "grad_norm": 0.4435058534145355, + "learning_rate": 1.2459016393442624e-05, + "loss": 0.7414, + "mean_token_accuracy": 0.7804962411522866, + "step": 190 + }, + { + "epoch": 0.19195275009228496, + "grad_norm": 0.34793269634246826, + "learning_rate": 1.2786885245901642e-05, + "loss": 0.7368, + "mean_token_accuracy": 0.7817707493901253, + "step": 195 + }, + { + "epoch": 0.19687461547926663, + "grad_norm": 0.32821062207221985, + "learning_rate": 1.3114754098360655e-05, + "loss": 0.7309, + "mean_token_accuracy": 0.7830819576978684, + "step": 200 + }, + { + "epoch": 0.2017964808662483, + "grad_norm": 0.3908160626888275, + "learning_rate": 1.3442622950819673e-05, + "loss": 0.7349, + "mean_token_accuracy": 0.7820746794342994, + "step": 205 + }, + { + "epoch": 0.20671834625322996, + "grad_norm": 1.239039659500122, + "learning_rate": 1.377049180327869e-05, + "loss": 0.7315, + "mean_token_accuracy": 0.7830250725150109, + "step": 210 + }, + { + "epoch": 0.21164021164021163, + "grad_norm": 0.437558650970459, + "learning_rate": 1.4098360655737706e-05, + "loss": 0.7213, + "mean_token_accuracy": 0.785545514523983, + "step": 215 + }, + { + "epoch": 0.2165620770271933, + "grad_norm": 0.3581276535987854, + "learning_rate": 1.4426229508196722e-05, + "loss": 0.7156, + "mean_token_accuracy": 0.7868386089801789, + "step": 220 + }, + { + "epoch": 0.22148394241417496, + "grad_norm": 0.393839031457901, + "learning_rate": 1.4754098360655739e-05, + "loss": 0.7108, + "mean_token_accuracy": 0.7875275865197182, + "step": 225 + }, + { + "epoch": 0.22640580780115663, + "grad_norm": 0.4203226566314697, + "learning_rate": 1.5081967213114754e-05, + "loss": 0.7115, + "mean_token_accuracy": 0.7875282734632492, + "step": 230 + }, + { + "epoch": 0.2313276731881383, + "grad_norm": 0.4379311501979828, + "learning_rate": 1.5409836065573772e-05, + "loss": 0.7176, + "mean_token_accuracy": 0.7859495177865028, + "step": 235 + }, + { + "epoch": 0.23624953857511996, + "grad_norm": 0.5987364053726196, + "learning_rate": 1.5737704918032788e-05, + "loss": 0.7047, + "mean_token_accuracy": 0.7892461016774177, + "step": 240 + }, + { + "epoch": 0.24117140396210163, + "grad_norm": 0.39721059799194336, + "learning_rate": 1.6065573770491805e-05, + "loss": 0.7082, + "mean_token_accuracy": 0.7879156336188317, + "step": 245 + }, + { + "epoch": 0.2460932693490833, + "grad_norm": 0.35150638222694397, + "learning_rate": 1.639344262295082e-05, + "loss": 0.7015, + "mean_token_accuracy": 0.7899731829762459, + "step": 250 + }, + { + "epoch": 0.25101513473606496, + "grad_norm": 0.37812677025794983, + "learning_rate": 1.6721311475409837e-05, + "loss": 0.7112, + "mean_token_accuracy": 0.7869908154010773, + "step": 255 + }, + { + "epoch": 0.25593700012304665, + "grad_norm": 0.37921008467674255, + "learning_rate": 1.7049180327868854e-05, + "loss": 0.695, + "mean_token_accuracy": 0.7912393018603325, + "step": 260 + }, + { + "epoch": 0.2608588655100283, + "grad_norm": 0.3776193857192993, + "learning_rate": 1.737704918032787e-05, + "loss": 0.6975, + "mean_token_accuracy": 0.7903847828507423, + "step": 265 + }, + { + "epoch": 0.26578073089701, + "grad_norm": 0.34160885214805603, + "learning_rate": 1.7704918032786887e-05, + "loss": 0.7005, + "mean_token_accuracy": 0.7901133581995964, + "step": 270 + }, + { + "epoch": 0.2707025962839916, + "grad_norm": 0.3151760399341583, + "learning_rate": 1.8032786885245903e-05, + "loss": 0.6838, + "mean_token_accuracy": 0.7940751999616623, + "step": 275 + }, + { + "epoch": 0.2756244616709733, + "grad_norm": 0.3251655101776123, + "learning_rate": 1.836065573770492e-05, + "loss": 0.683, + "mean_token_accuracy": 0.7942519947886467, + "step": 280 + }, + { + "epoch": 0.28054632705795496, + "grad_norm": 0.392980694770813, + "learning_rate": 1.8688524590163936e-05, + "loss": 0.6779, + "mean_token_accuracy": 0.7953907087445259, + "step": 285 + }, + { + "epoch": 0.28546819244493665, + "grad_norm": 0.42777085304260254, + "learning_rate": 1.9016393442622952e-05, + "loss": 0.696, + "mean_token_accuracy": 0.7913835749030114, + "step": 290 + }, + { + "epoch": 0.2903900578319183, + "grad_norm": 0.38064613938331604, + "learning_rate": 1.934426229508197e-05, + "loss": 0.6777, + "mean_token_accuracy": 0.79527537971735, + "step": 295 + }, + { + "epoch": 0.2953119232189, + "grad_norm": 0.35906219482421875, + "learning_rate": 1.9672131147540985e-05, + "loss": 0.6772, + "mean_token_accuracy": 0.7954441845417023, + "step": 300 + }, + { + "epoch": 0.3002337886058816, + "grad_norm": 0.4336443543434143, + "learning_rate": 2e-05, + "loss": 0.6672, + "mean_token_accuracy": 0.7982369065284729, + "step": 305 + }, + { + "epoch": 0.3051556539928633, + "grad_norm": 0.35013464093208313, + "learning_rate": 1.9999835673561284e-05, + "loss": 0.6823, + "mean_token_accuracy": 0.7940784975886345, + "step": 310 + }, + { + "epoch": 0.31007751937984496, + "grad_norm": 0.4209573566913605, + "learning_rate": 1.9999342699645774e-05, + "loss": 0.6705, + "mean_token_accuracy": 0.7970875754952431, + "step": 315 + }, + { + "epoch": 0.31499938476682665, + "grad_norm": 0.3402932584285736, + "learning_rate": 1.9998521094455198e-05, + "loss": 0.6733, + "mean_token_accuracy": 0.7962517961859703, + "step": 320 + }, + { + "epoch": 0.3199212501538083, + "grad_norm": 0.3613898456096649, + "learning_rate": 1.9997370884991842e-05, + "loss": 0.6659, + "mean_token_accuracy": 0.7986094921827316, + "step": 325 + }, + { + "epoch": 0.32484311554079, + "grad_norm": 0.8141839504241943, + "learning_rate": 1.9995892109057675e-05, + "loss": 0.6682, + "mean_token_accuracy": 0.7979325890541077, + "step": 330 + }, + { + "epoch": 0.3297649809277716, + "grad_norm": 0.32822492718696594, + "learning_rate": 1.99940848152531e-05, + "loss": 0.6592, + "mean_token_accuracy": 0.799762362241745, + "step": 335 + }, + { + "epoch": 0.3346868463147533, + "grad_norm": 0.32193639874458313, + "learning_rate": 1.9991949062975336e-05, + "loss": 0.6669, + "mean_token_accuracy": 0.7977916583418846, + "step": 340 + }, + { + "epoch": 0.33960871170173496, + "grad_norm": 0.6516172885894775, + "learning_rate": 1.9989484922416503e-05, + "loss": 0.6636, + "mean_token_accuracy": 0.7989253982901573, + "step": 345 + }, + { + "epoch": 0.34453057708871665, + "grad_norm": 0.6252678036689758, + "learning_rate": 1.9986692474561292e-05, + "loss": 0.6549, + "mean_token_accuracy": 0.8010424450039864, + "step": 350 + }, + { + "epoch": 0.3494524424756983, + "grad_norm": 0.39426907896995544, + "learning_rate": 1.9983571811184297e-05, + "loss": 0.6583, + "mean_token_accuracy": 0.8001298069953918, + "step": 355 + }, + { + "epoch": 0.35437430786268, + "grad_norm": 0.4398311972618103, + "learning_rate": 1.9980123034847025e-05, + "loss": 0.6569, + "mean_token_accuracy": 0.8002386093139648, + "step": 360 + }, + { + "epoch": 0.3592961732496616, + "grad_norm": 0.36181896924972534, + "learning_rate": 1.9976346258894502e-05, + "loss": 0.6572, + "mean_token_accuracy": 0.7999640181660652, + "step": 365 + }, + { + "epoch": 0.3642180386366433, + "grad_norm": 0.33937492966651917, + "learning_rate": 1.9972241607451552e-05, + "loss": 0.6534, + "mean_token_accuracy": 0.8008638471364975, + "step": 370 + }, + { + "epoch": 0.36913990402362495, + "grad_norm": 0.3220241665840149, + "learning_rate": 1.996780921541873e-05, + "loss": 0.6491, + "mean_token_accuracy": 0.8024497851729393, + "step": 375 + }, + { + "epoch": 0.3740617694106066, + "grad_norm": 0.3588990867137909, + "learning_rate": 1.9963049228467875e-05, + "loss": 0.6519, + "mean_token_accuracy": 0.8013440445065498, + "step": 380 + }, + { + "epoch": 0.3789836347975883, + "grad_norm": 0.3850741982460022, + "learning_rate": 1.9957961803037325e-05, + "loss": 0.6539, + "mean_token_accuracy": 0.8007026329636574, + "step": 385 + }, + { + "epoch": 0.3839055001845699, + "grad_norm": 0.39418673515319824, + "learning_rate": 1.9952547106326787e-05, + "loss": 0.6511, + "mean_token_accuracy": 0.8013290241360664, + "step": 390 + }, + { + "epoch": 0.3888273655715516, + "grad_norm": 0.33889254927635193, + "learning_rate": 1.9946805316291817e-05, + "loss": 0.6523, + "mean_token_accuracy": 0.8005807921290398, + "step": 395 + }, + { + "epoch": 0.39374923095853326, + "grad_norm": 0.7381798624992371, + "learning_rate": 1.9940736621638e-05, + "loss": 0.649, + "mean_token_accuracy": 0.8016207367181778, + "step": 400 + }, + { + "epoch": 0.39867109634551495, + "grad_norm": 0.3772973120212555, + "learning_rate": 1.993434122181474e-05, + "loss": 0.6458, + "mean_token_accuracy": 0.802768674492836, + "step": 405 + }, + { + "epoch": 0.4035929617324966, + "grad_norm": 0.33333730697631836, + "learning_rate": 1.992761932700868e-05, + "loss": 0.6444, + "mean_token_accuracy": 0.8025879472494125, + "step": 410 + }, + { + "epoch": 0.4085148271194783, + "grad_norm": 0.3165677785873413, + "learning_rate": 1.9920571158136837e-05, + "loss": 0.639, + "mean_token_accuracy": 0.8042329683899879, + "step": 415 + }, + { + "epoch": 0.4134366925064599, + "grad_norm": 0.3313787579536438, + "learning_rate": 1.9913196946839304e-05, + "loss": 0.6422, + "mean_token_accuracy": 0.803669148683548, + "step": 420 + }, + { + "epoch": 0.4183585578934416, + "grad_norm": 0.2832159101963043, + "learning_rate": 1.990549693547166e-05, + "loss": 0.6378, + "mean_token_accuracy": 0.8049987867474556, + "step": 425 + }, + { + "epoch": 0.42328042328042326, + "grad_norm": 0.3278089463710785, + "learning_rate": 1.9897471377096992e-05, + "loss": 0.638, + "mean_token_accuracy": 0.8043939173221588, + "step": 430 + }, + { + "epoch": 0.42820228866740495, + "grad_norm": 0.33513346314430237, + "learning_rate": 1.9889120535477584e-05, + "loss": 0.6366, + "mean_token_accuracy": 0.80514996945858, + "step": 435 + }, + { + "epoch": 0.4331241540543866, + "grad_norm": 0.36697131395339966, + "learning_rate": 1.9880444685066252e-05, + "loss": 0.6322, + "mean_token_accuracy": 0.8064638406038285, + "step": 440 + }, + { + "epoch": 0.4380460194413683, + "grad_norm": 0.34239935874938965, + "learning_rate": 1.987144411099731e-05, + "loss": 0.6328, + "mean_token_accuracy": 0.8058159291744232, + "step": 445 + }, + { + "epoch": 0.4429678848283499, + "grad_norm": 0.29778754711151123, + "learning_rate": 1.9862119109077226e-05, + "loss": 0.6442, + "mean_token_accuracy": 0.8030599504709244, + "step": 450 + }, + { + "epoch": 0.4478897502153316, + "grad_norm": 0.31139907240867615, + "learning_rate": 1.985246998577486e-05, + "loss": 0.6507, + "mean_token_accuracy": 0.8007849171757698, + "step": 455 + }, + { + "epoch": 0.45281161560231326, + "grad_norm": 0.32070034742355347, + "learning_rate": 1.984249705821143e-05, + "loss": 0.6405, + "mean_token_accuracy": 0.8038340613245964, + "step": 460 + }, + { + "epoch": 0.45773348098929495, + "grad_norm": 0.3086022734642029, + "learning_rate": 1.9832200654150077e-05, + "loss": 0.6316, + "mean_token_accuracy": 0.8058078184723854, + "step": 465 + }, + { + "epoch": 0.4626553463762766, + "grad_norm": 0.30972251296043396, + "learning_rate": 1.9821581111985072e-05, + "loss": 0.6343, + "mean_token_accuracy": 0.8051379904150963, + "step": 470 + }, + { + "epoch": 0.4675772117632583, + "grad_norm": 0.2832852005958557, + "learning_rate": 1.981063878073073e-05, + "loss": 0.6324, + "mean_token_accuracy": 0.8058837354183197, + "step": 475 + }, + { + "epoch": 0.4724990771502399, + "grad_norm": 0.909318208694458, + "learning_rate": 1.979937402000991e-05, + "loss": 0.6319, + "mean_token_accuracy": 0.8056973502039909, + "step": 480 + }, + { + "epoch": 0.4774209425372216, + "grad_norm": 0.31788304448127747, + "learning_rate": 1.9787787200042224e-05, + "loss": 0.6354, + "mean_token_accuracy": 0.8051144614815712, + "step": 485 + }, + { + "epoch": 0.48234280792420325, + "grad_norm": 0.2922450602054596, + "learning_rate": 1.977587870163184e-05, + "loss": 0.6278, + "mean_token_accuracy": 0.8066384568810463, + "step": 490 + }, + { + "epoch": 0.48726467331118495, + "grad_norm": 0.287406325340271, + "learning_rate": 1.9763648916154982e-05, + "loss": 0.6271, + "mean_token_accuracy": 0.8069956362247467, + "step": 495 + }, + { + "epoch": 0.4921865386981666, + "grad_norm": 0.34040403366088867, + "learning_rate": 1.975109824554707e-05, + "loss": 0.6288, + "mean_token_accuracy": 0.806525257229805, + "step": 500 + }, + { + "epoch": 0.4971084040851483, + "grad_norm": 0.3302447199821472, + "learning_rate": 1.973822710228951e-05, + "loss": 0.6257, + "mean_token_accuracy": 0.8072399228811264, + "step": 505 + }, + { + "epoch": 0.5020302694721299, + "grad_norm": 0.288161963224411, + "learning_rate": 1.972503590939612e-05, + "loss": 0.6234, + "mean_token_accuracy": 0.8078823387622833, + "step": 510 + }, + { + "epoch": 0.5069521348591116, + "grad_norm": 0.3387835919857025, + "learning_rate": 1.971152510039926e-05, + "loss": 0.6269, + "mean_token_accuracy": 0.8067226454615593, + "step": 515 + }, + { + "epoch": 0.5118740002460933, + "grad_norm": 0.290519118309021, + "learning_rate": 1.9697695119335547e-05, + "loss": 0.6213, + "mean_token_accuracy": 0.8083379164338111, + "step": 520 + }, + { + "epoch": 0.5167958656330749, + "grad_norm": 0.3701138496398926, + "learning_rate": 1.9683546420731292e-05, + "loss": 0.6246, + "mean_token_accuracy": 0.8079604268074035, + "step": 525 + }, + { + "epoch": 0.5217177310200566, + "grad_norm": 0.39614954590797424, + "learning_rate": 1.9669079469587548e-05, + "loss": 0.6287, + "mean_token_accuracy": 0.8067878499627114, + "step": 530 + }, + { + "epoch": 0.5266395964070383, + "grad_norm": 0.32784542441368103, + "learning_rate": 1.965429474136482e-05, + "loss": 0.6156, + "mean_token_accuracy": 0.8098407059907913, + "step": 535 + }, + { + "epoch": 0.53156146179402, + "grad_norm": 0.30213144421577454, + "learning_rate": 1.963919272196746e-05, + "loss": 0.6207, + "mean_token_accuracy": 0.8086924180388451, + "step": 540 + }, + { + "epoch": 0.5364833271810016, + "grad_norm": 0.32220178842544556, + "learning_rate": 1.9623773907727682e-05, + "loss": 0.6157, + "mean_token_accuracy": 0.8098208606243134, + "step": 545 + }, + { + "epoch": 0.5414051925679833, + "grad_norm": 0.3250666856765747, + "learning_rate": 1.9608038805389253e-05, + "loss": 0.6195, + "mean_token_accuracy": 0.8085113659501075, + "step": 550 + }, + { + "epoch": 0.546327057954965, + "grad_norm": 0.36724722385406494, + "learning_rate": 1.9591987932090836e-05, + "loss": 0.6115, + "mean_token_accuracy": 0.8109661117196083, + "step": 555 + }, + { + "epoch": 0.5512489233419466, + "grad_norm": 0.30343472957611084, + "learning_rate": 1.9575621815349e-05, + "loss": 0.6204, + "mean_token_accuracy": 0.8083494484424592, + "step": 560 + }, + { + "epoch": 0.5561707887289282, + "grad_norm": 0.3323419988155365, + "learning_rate": 1.9558940993040885e-05, + "loss": 0.6232, + "mean_token_accuracy": 0.8077159106731415, + "step": 565 + }, + { + "epoch": 0.5610926541159099, + "grad_norm": 0.31035885214805603, + "learning_rate": 1.954194601338651e-05, + "loss": 0.6157, + "mean_token_accuracy": 0.8096732005476952, + "step": 570 + }, + { + "epoch": 0.5660145195028916, + "grad_norm": 0.2931119501590729, + "learning_rate": 1.952463743493078e-05, + "loss": 0.6199, + "mean_token_accuracy": 0.808499938249588, + "step": 575 + }, + { + "epoch": 0.5709363848898733, + "grad_norm": 0.27563023567199707, + "learning_rate": 1.9507015826525096e-05, + "loss": 0.6046, + "mean_token_accuracy": 0.8128907606005669, + "step": 580 + }, + { + "epoch": 0.5758582502768549, + "grad_norm": 0.28453299403190613, + "learning_rate": 1.9489081767308696e-05, + "loss": 0.6105, + "mean_token_accuracy": 0.8113355338573456, + "step": 585 + }, + { + "epoch": 0.5807801156638366, + "grad_norm": 0.37042465806007385, + "learning_rate": 1.9470835846689596e-05, + "loss": 0.6127, + "mean_token_accuracy": 0.8106034889817237, + "step": 590 + }, + { + "epoch": 0.5857019810508183, + "grad_norm": 0.2963549792766571, + "learning_rate": 1.9452278664325227e-05, + "loss": 0.6194, + "mean_token_accuracy": 0.8086869075894356, + "step": 595 + }, + { + "epoch": 0.5906238464378, + "grad_norm": 0.2905316948890686, + "learning_rate": 1.9433410830102724e-05, + "loss": 0.61, + "mean_token_accuracy": 0.811042046546936, + "step": 600 + }, + { + "epoch": 0.5955457118247816, + "grad_norm": 0.2674277424812317, + "learning_rate": 1.9414232964118893e-05, + "loss": 0.6119, + "mean_token_accuracy": 0.8104571312665939, + "step": 605 + }, + { + "epoch": 0.6004675772117632, + "grad_norm": 0.28245261311531067, + "learning_rate": 1.939474569665981e-05, + "loss": 0.6115, + "mean_token_accuracy": 0.8106845885515213, + "step": 610 + }, + { + "epoch": 0.6053894425987449, + "grad_norm": 0.2713403105735779, + "learning_rate": 1.937494966818014e-05, + "loss": 0.6096, + "mean_token_accuracy": 0.8106750875711441, + "step": 615 + }, + { + "epoch": 0.6103113079857266, + "grad_norm": 0.31770050525665283, + "learning_rate": 1.9354845529282042e-05, + "loss": 0.6142, + "mean_token_accuracy": 0.8098479628562927, + "step": 620 + }, + { + "epoch": 0.6152331733727082, + "grad_norm": 0.28526055812835693, + "learning_rate": 1.933443394069383e-05, + "loss": 0.6062, + "mean_token_accuracy": 0.8120482847094536, + "step": 625 + }, + { + "epoch": 0.6201550387596899, + "grad_norm": 0.5695453882217407, + "learning_rate": 1.9313715573248238e-05, + "loss": 0.6122, + "mean_token_accuracy": 0.8099897101521492, + "step": 630 + }, + { + "epoch": 0.6250769041466716, + "grad_norm": 0.2738396227359772, + "learning_rate": 1.9292691107860374e-05, + "loss": 0.6031, + "mean_token_accuracy": 0.8127053424715995, + "step": 635 + }, + { + "epoch": 0.6299987695336533, + "grad_norm": 0.28948965668678284, + "learning_rate": 1.927136123550534e-05, + "loss": 0.6115, + "mean_token_accuracy": 0.8103477448225022, + "step": 640 + }, + { + "epoch": 0.6349206349206349, + "grad_norm": 0.27830740809440613, + "learning_rate": 1.9249726657195534e-05, + "loss": 0.608, + "mean_token_accuracy": 0.8116561621427536, + "step": 645 + }, + { + "epoch": 0.6398425003076166, + "grad_norm": 0.2712289094924927, + "learning_rate": 1.922778808395759e-05, + "loss": 0.6054, + "mean_token_accuracy": 0.8125208973884582, + "step": 650 + }, + { + "epoch": 0.6447643656945983, + "grad_norm": 0.29063907265663147, + "learning_rate": 1.9205546236809037e-05, + "loss": 0.6047, + "mean_token_accuracy": 0.8123130992054939, + "step": 655 + }, + { + "epoch": 0.64968623108158, + "grad_norm": 0.293261855840683, + "learning_rate": 1.9183001846734573e-05, + "loss": 0.603, + "mean_token_accuracy": 0.8129645109176635, + "step": 660 + }, + { + "epoch": 0.6546080964685616, + "grad_norm": 0.2849041223526001, + "learning_rate": 1.9160155654662075e-05, + "loss": 0.5926, + "mean_token_accuracy": 0.8157610684633255, + "step": 665 + }, + { + "epoch": 0.6595299618555432, + "grad_norm": 0.2975578010082245, + "learning_rate": 1.9137008411438213e-05, + "loss": 0.6034, + "mean_token_accuracy": 0.8125734269618988, + "step": 670 + }, + { + "epoch": 0.6644518272425249, + "grad_norm": 0.286842405796051, + "learning_rate": 1.9113560877803798e-05, + "loss": 0.6045, + "mean_token_accuracy": 0.8125320598483086, + "step": 675 + }, + { + "epoch": 0.6693736926295066, + "grad_norm": 0.33480602502822876, + "learning_rate": 1.9089813824368765e-05, + "loss": 0.5975, + "mean_token_accuracy": 0.8142675384879112, + "step": 680 + }, + { + "epoch": 0.6742955580164882, + "grad_norm": 0.29252228140830994, + "learning_rate": 1.9065768031586864e-05, + "loss": 0.6056, + "mean_token_accuracy": 0.8120014935731887, + "step": 685 + }, + { + "epoch": 0.6792174234034699, + "grad_norm": 0.2882521450519562, + "learning_rate": 1.9041424289729994e-05, + "loss": 0.595, + "mean_token_accuracy": 0.8150214269757271, + "step": 690 + }, + { + "epoch": 0.6841392887904516, + "grad_norm": 0.29731523990631104, + "learning_rate": 1.901678339886223e-05, + "loss": 0.6013, + "mean_token_accuracy": 0.8131750777363778, + "step": 695 + }, + { + "epoch": 0.6890611541774333, + "grad_norm": 0.26834896206855774, + "learning_rate": 1.8991846168813547e-05, + "loss": 0.5918, + "mean_token_accuracy": 0.8156168267130852, + "step": 700 + }, + { + "epoch": 0.6939830195644149, + "grad_norm": 0.29199543595314026, + "learning_rate": 1.896661341915318e-05, + "loss": 0.6033, + "mean_token_accuracy": 0.8124941572546959, + "step": 705 + }, + { + "epoch": 0.6989048849513966, + "grad_norm": 0.28719085454940796, + "learning_rate": 1.8941085979162714e-05, + "loss": 0.5992, + "mean_token_accuracy": 0.8138533607125282, + "step": 710 + }, + { + "epoch": 0.7038267503383783, + "grad_norm": 0.28042468428611755, + "learning_rate": 1.891526468780881e-05, + "loss": 0.605, + "mean_token_accuracy": 0.8121193930506706, + "step": 715 + }, + { + "epoch": 0.70874861572536, + "grad_norm": 0.272483766078949, + "learning_rate": 1.8889150393715627e-05, + "loss": 0.5943, + "mean_token_accuracy": 0.8147971466183662, + "step": 720 + }, + { + "epoch": 0.7136704811123415, + "grad_norm": 0.24886226654052734, + "learning_rate": 1.8862743955136966e-05, + "loss": 0.5957, + "mean_token_accuracy": 0.8145680665969849, + "step": 725 + }, + { + "epoch": 0.7185923464993232, + "grad_norm": 0.26445212960243225, + "learning_rate": 1.8836046239928025e-05, + "loss": 0.5948, + "mean_token_accuracy": 0.8148575246334075, + "step": 730 + }, + { + "epoch": 0.7235142118863049, + "grad_norm": 0.2891506850719452, + "learning_rate": 1.8809058125516894e-05, + "loss": 0.5968, + "mean_token_accuracy": 0.8141703933477402, + "step": 735 + }, + { + "epoch": 0.7284360772732866, + "grad_norm": 0.28364264965057373, + "learning_rate": 1.8781780498875727e-05, + "loss": 0.6035, + "mean_token_accuracy": 0.8124788105487823, + "step": 740 + }, + { + "epoch": 0.7333579426602682, + "grad_norm": 0.2917366921901703, + "learning_rate": 1.8754214256491564e-05, + "loss": 0.5928, + "mean_token_accuracy": 0.8153851807117463, + "step": 745 + }, + { + "epoch": 0.7382798080472499, + "grad_norm": 0.2714190185070038, + "learning_rate": 1.8726360304336896e-05, + "loss": 0.601, + "mean_token_accuracy": 0.8129221558570862, + "step": 750 + }, + { + "epoch": 0.7432016734342316, + "grad_norm": 0.29474568367004395, + "learning_rate": 1.8698219557839875e-05, + "loss": 0.5963, + "mean_token_accuracy": 0.8142225205898285, + "step": 755 + }, + { + "epoch": 0.7481235388212132, + "grad_norm": 0.2684454619884491, + "learning_rate": 1.866979294185423e-05, + "loss": 0.5933, + "mean_token_accuracy": 0.8149216592311859, + "step": 760 + }, + { + "epoch": 0.7530454042081949, + "grad_norm": 0.26693102717399597, + "learning_rate": 1.864108139062888e-05, + "loss": 0.5908, + "mean_token_accuracy": 0.8157912597060204, + "step": 765 + }, + { + "epoch": 0.7579672695951766, + "grad_norm": 0.27418771386146545, + "learning_rate": 1.8612085847777215e-05, + "loss": 0.5913, + "mean_token_accuracy": 0.8156127855181694, + "step": 770 + }, + { + "epoch": 0.7628891349821583, + "grad_norm": 0.30855274200439453, + "learning_rate": 1.858280726624609e-05, + "loss": 0.5922, + "mean_token_accuracy": 0.81515374481678, + "step": 775 + }, + { + "epoch": 0.7678110003691399, + "grad_norm": 0.2978297472000122, + "learning_rate": 1.855324660828452e-05, + "loss": 0.5999, + "mean_token_accuracy": 0.8132428601384163, + "step": 780 + }, + { + "epoch": 0.7727328657561215, + "grad_norm": 0.30609989166259766, + "learning_rate": 1.8523404845412028e-05, + "loss": 0.5931, + "mean_token_accuracy": 0.8152095600962639, + "step": 785 + }, + { + "epoch": 0.7776547311431032, + "grad_norm": 0.28423747420310974, + "learning_rate": 1.849328295838674e-05, + "loss": 0.5939, + "mean_token_accuracy": 0.8150446817278862, + "step": 790 + }, + { + "epoch": 0.7825765965300849, + "grad_norm": 0.39114367961883545, + "learning_rate": 1.8462881937173144e-05, + "loss": 0.5886, + "mean_token_accuracy": 0.8164272159337997, + "step": 795 + }, + { + "epoch": 0.7874984619170665, + "grad_norm": 0.2761843502521515, + "learning_rate": 1.8432202780909542e-05, + "loss": 0.594, + "mean_token_accuracy": 0.8146432772278785, + "step": 800 + }, + { + "epoch": 0.7924203273040482, + "grad_norm": 0.26402318477630615, + "learning_rate": 1.8401246497875238e-05, + "loss": 0.5892, + "mean_token_accuracy": 0.8162309199571609, + "step": 805 + }, + { + "epoch": 0.7973421926910299, + "grad_norm": 0.26799553632736206, + "learning_rate": 1.8370014105457378e-05, + "loss": 0.5901, + "mean_token_accuracy": 0.8156055212020874, + "step": 810 + }, + { + "epoch": 0.8022640580780116, + "grad_norm": 0.3189884126186371, + "learning_rate": 1.8338506630117527e-05, + "loss": 0.5821, + "mean_token_accuracy": 0.8177683308720589, + "step": 815 + }, + { + "epoch": 0.8071859234649932, + "grad_norm": 0.26993831992149353, + "learning_rate": 1.8306725107357933e-05, + "loss": 0.5887, + "mean_token_accuracy": 0.8162371620535851, + "step": 820 + }, + { + "epoch": 0.8121077888519749, + "grad_norm": 0.33908817172050476, + "learning_rate": 1.827467058168748e-05, + "loss": 0.5932, + "mean_token_accuracy": 0.8148850262165069, + "step": 825 + }, + { + "epoch": 0.8170296542389566, + "grad_norm": 0.2749953866004944, + "learning_rate": 1.824234410658738e-05, + "loss": 0.5807, + "mean_token_accuracy": 0.8185225054621696, + "step": 830 + }, + { + "epoch": 0.8219515196259383, + "grad_norm": 0.28679126501083374, + "learning_rate": 1.8209746744476538e-05, + "loss": 0.5844, + "mean_token_accuracy": 0.81742594987154, + "step": 835 + }, + { + "epoch": 0.8268733850129198, + "grad_norm": 0.29817092418670654, + "learning_rate": 1.817687956667664e-05, + "loss": 0.584, + "mean_token_accuracy": 0.8173492252826691, + "step": 840 + }, + { + "epoch": 0.8317952503999015, + "grad_norm": 0.2705828547477722, + "learning_rate": 1.8143743653376944e-05, + "loss": 0.5955, + "mean_token_accuracy": 0.8145547702908515, + "step": 845 + }, + { + "epoch": 0.8367171157868832, + "grad_norm": 0.28381243348121643, + "learning_rate": 1.811034009359877e-05, + "loss": 0.5833, + "mean_token_accuracy": 0.8177738025784492, + "step": 850 + }, + { + "epoch": 0.8416389811738649, + "grad_norm": 0.2846708595752716, + "learning_rate": 1.8076669985159726e-05, + "loss": 0.5817, + "mean_token_accuracy": 0.8179952159523964, + "step": 855 + }, + { + "epoch": 0.8465608465608465, + "grad_norm": 0.2997231185436249, + "learning_rate": 1.8042734434637615e-05, + "loss": 0.5934, + "mean_token_accuracy": 0.8149283960461616, + "step": 860 + }, + { + "epoch": 0.8514827119478282, + "grad_norm": 0.29204457998275757, + "learning_rate": 1.8008534557334064e-05, + "loss": 0.5795, + "mean_token_accuracy": 0.8184737205505371, + "step": 865 + }, + { + "epoch": 0.8564045773348099, + "grad_norm": 0.30441614985466003, + "learning_rate": 1.7974071477237887e-05, + "loss": 0.585, + "mean_token_accuracy": 0.8171376779675483, + "step": 870 + }, + { + "epoch": 0.8613264427217916, + "grad_norm": 0.2779221832752228, + "learning_rate": 1.7939346326988127e-05, + "loss": 0.5889, + "mean_token_accuracy": 0.8160797134041786, + "step": 875 + }, + { + "epoch": 0.8662483081087732, + "grad_norm": 0.250242680311203, + "learning_rate": 1.7904360247836838e-05, + "loss": 0.5894, + "mean_token_accuracy": 0.81572295576334, + "step": 880 + }, + { + "epoch": 0.8711701734957549, + "grad_norm": 0.26801884174346924, + "learning_rate": 1.7869114389611574e-05, + "loss": 0.5853, + "mean_token_accuracy": 0.8168028473854065, + "step": 885 + }, + { + "epoch": 0.8760920388827366, + "grad_norm": 0.33699533343315125, + "learning_rate": 1.7833609910677613e-05, + "loss": 0.5804, + "mean_token_accuracy": 0.8181165441870689, + "step": 890 + }, + { + "epoch": 0.8810139042697183, + "grad_norm": 0.28362491726875305, + "learning_rate": 1.7797847977899873e-05, + "loss": 0.5823, + "mean_token_accuracy": 0.8177706867456436, + "step": 895 + }, + { + "epoch": 0.8859357696566998, + "grad_norm": 0.2863147556781769, + "learning_rate": 1.7761829766604556e-05, + "loss": 0.5797, + "mean_token_accuracy": 0.8185298308730126, + "step": 900 + }, + { + "epoch": 0.8908576350436815, + "grad_norm": 0.27263742685317993, + "learning_rate": 1.7725556460540553e-05, + "loss": 0.5825, + "mean_token_accuracy": 0.8175166144967079, + "step": 905 + }, + { + "epoch": 0.8957795004306632, + "grad_norm": 0.28120777010917664, + "learning_rate": 1.7689029251840492e-05, + "loss": 0.5788, + "mean_token_accuracy": 0.8185988172888756, + "step": 910 + }, + { + "epoch": 0.9007013658176449, + "grad_norm": 0.3469211459159851, + "learning_rate": 1.7652249340981608e-05, + "loss": 0.5877, + "mean_token_accuracy": 0.8159551978111267, + "step": 915 + }, + { + "epoch": 0.9056232312046265, + "grad_norm": 0.3101508617401123, + "learning_rate": 1.7615217936746246e-05, + "loss": 0.5819, + "mean_token_accuracy": 0.8174650520086288, + "step": 920 + }, + { + "epoch": 0.9105450965916082, + "grad_norm": 0.38838618993759155, + "learning_rate": 1.757793625618217e-05, + "loss": 0.5755, + "mean_token_accuracy": 0.8196040257811547, + "step": 925 + }, + { + "epoch": 0.9154669619785899, + "grad_norm": 0.3253493309020996, + "learning_rate": 1.7540405524562533e-05, + "loss": 0.5777, + "mean_token_accuracy": 0.8182825416326522, + "step": 930 + }, + { + "epoch": 0.9203888273655716, + "grad_norm": 0.2917826175689697, + "learning_rate": 1.750262697534563e-05, + "loss": 0.5809, + "mean_token_accuracy": 0.8180661648511887, + "step": 935 + }, + { + "epoch": 0.9253106927525532, + "grad_norm": 0.25714483857154846, + "learning_rate": 1.7464601850134353e-05, + "loss": 0.5752, + "mean_token_accuracy": 0.8194984391331672, + "step": 940 + }, + { + "epoch": 0.9302325581395349, + "grad_norm": 0.28597357869148254, + "learning_rate": 1.742633139863538e-05, + "loss": 0.579, + "mean_token_accuracy": 0.8184013769030571, + "step": 945 + }, + { + "epoch": 0.9351544235265166, + "grad_norm": 0.9777734875679016, + "learning_rate": 1.738781687861812e-05, + "loss": 0.5789, + "mean_token_accuracy": 0.8188063263893127, + "step": 950 + }, + { + "epoch": 0.9400762889134983, + "grad_norm": 0.26717498898506165, + "learning_rate": 1.7349059555873348e-05, + "loss": 0.5754, + "mean_token_accuracy": 0.8191799059510231, + "step": 955 + }, + { + "epoch": 0.9449981543004798, + "grad_norm": 0.29053807258605957, + "learning_rate": 1.731006070417163e-05, + "loss": 0.5726, + "mean_token_accuracy": 0.8204409092664718, + "step": 960 + }, + { + "epoch": 0.9499200196874615, + "grad_norm": 0.3052172362804413, + "learning_rate": 1.7270821605221448e-05, + "loss": 0.5764, + "mean_token_accuracy": 0.819102555513382, + "step": 965 + }, + { + "epoch": 0.9548418850744432, + "grad_norm": 0.33640167117118835, + "learning_rate": 1.7231343548627085e-05, + "loss": 0.5789, + "mean_token_accuracy": 0.8184890508651733, + "step": 970 + }, + { + "epoch": 0.9597637504614249, + "grad_norm": 0.2829669415950775, + "learning_rate": 1.7191627831846226e-05, + "loss": 0.5803, + "mean_token_accuracy": 0.8179109930992127, + "step": 975 + }, + { + "epoch": 0.9646856158484065, + "grad_norm": 0.2560986280441284, + "learning_rate": 1.7151675760147325e-05, + "loss": 0.5721, + "mean_token_accuracy": 0.8198479250073433, + "step": 980 + }, + { + "epoch": 0.9696074812353882, + "grad_norm": 0.27663761377334595, + "learning_rate": 1.7111488646566728e-05, + "loss": 0.5851, + "mean_token_accuracy": 0.8171452388167382, + "step": 985 + }, + { + "epoch": 0.9745293466223699, + "grad_norm": 0.2673356235027313, + "learning_rate": 1.7071067811865477e-05, + "loss": 0.5751, + "mean_token_accuracy": 0.8194502517580986, + "step": 990 + }, + { + "epoch": 0.9794512120093516, + "grad_norm": 0.2639131546020508, + "learning_rate": 1.7030414584485938e-05, + "loss": 0.5757, + "mean_token_accuracy": 0.8192202031612397, + "step": 995 + }, + { + "epoch": 0.9843730773963332, + "grad_norm": 0.2639618515968323, + "learning_rate": 1.6989530300508126e-05, + "loss": 0.576, + "mean_token_accuracy": 0.8191347226500512, + "step": 1000 + }, + { + "epoch": 0.9892949427833149, + "grad_norm": 0.2554817199707031, + "learning_rate": 1.6948416303605796e-05, + "loss": 0.5778, + "mean_token_accuracy": 0.8186899140477181, + "step": 1005 + }, + { + "epoch": 0.9942168081702966, + "grad_norm": 0.25301820039749146, + "learning_rate": 1.690707394500229e-05, + "loss": 0.576, + "mean_token_accuracy": 0.8191317170858383, + "step": 1010 + }, + { + "epoch": 0.9991386735572783, + "grad_norm": 0.2470293790102005, + "learning_rate": 1.6865504583426117e-05, + "loss": 0.5707, + "mean_token_accuracy": 0.8204790607094765, + "step": 1015 + }, + { + "epoch": 1.0049218653869816, + "grad_norm": 0.3501671254634857, + "learning_rate": 1.6823709585066308e-05, + "loss": 0.6648, + "mean_token_accuracy": 0.824617318990754, + "step": 1020 + }, + { + "epoch": 1.0098437307739634, + "grad_norm": 0.30985623598098755, + "learning_rate": 1.6781690323527512e-05, + "loss": 0.5503, + "mean_token_accuracy": 0.8255873426795006, + "step": 1025 + }, + { + "epoch": 1.014765596160945, + "grad_norm": 0.2879364788532257, + "learning_rate": 1.6739448179784846e-05, + "loss": 0.5529, + "mean_token_accuracy": 0.8247572600841522, + "step": 1030 + }, + { + "epoch": 1.0196874615479268, + "grad_norm": 0.27657514810562134, + "learning_rate": 1.669698454213852e-05, + "loss": 0.55, + "mean_token_accuracy": 0.8258542969822884, + "step": 1035 + }, + { + "epoch": 1.0246093269349084, + "grad_norm": 0.259316623210907, + "learning_rate": 1.665430080616821e-05, + "loss": 0.5435, + "mean_token_accuracy": 0.8273309215903282, + "step": 1040 + }, + { + "epoch": 1.02953119232189, + "grad_norm": 0.27227073907852173, + "learning_rate": 1.6611398374687172e-05, + "loss": 0.5494, + "mean_token_accuracy": 0.8259153485298156, + "step": 1045 + }, + { + "epoch": 1.0344530577088717, + "grad_norm": 0.2718289792537689, + "learning_rate": 1.6568278657696166e-05, + "loss": 0.5445, + "mean_token_accuracy": 0.827112241089344, + "step": 1050 + }, + { + "epoch": 1.0393749230958533, + "grad_norm": 0.28744345903396606, + "learning_rate": 1.6524943072337094e-05, + "loss": 0.5501, + "mean_token_accuracy": 0.8256638810038567, + "step": 1055 + }, + { + "epoch": 1.044296788482835, + "grad_norm": 0.26266416907310486, + "learning_rate": 1.6481393042846442e-05, + "loss": 0.5467, + "mean_token_accuracy": 0.8264568135142326, + "step": 1060 + }, + { + "epoch": 1.0492186538698167, + "grad_norm": 0.25888925790786743, + "learning_rate": 1.6437630000508466e-05, + "loss": 0.5522, + "mean_token_accuracy": 0.8247309610247612, + "step": 1065 + }, + { + "epoch": 1.0541405192567983, + "grad_norm": 0.25061705708503723, + "learning_rate": 1.6393655383608132e-05, + "loss": 0.5459, + "mean_token_accuracy": 0.8267670929431915, + "step": 1070 + }, + { + "epoch": 1.0590623846437799, + "grad_norm": 0.25011131167411804, + "learning_rate": 1.634947063738389e-05, + "loss": 0.5483, + "mean_token_accuracy": 0.8261876925826073, + "step": 1075 + }, + { + "epoch": 1.0639842500307617, + "grad_norm": 0.26051655411720276, + "learning_rate": 1.630507721398013e-05, + "loss": 0.5452, + "mean_token_accuracy": 0.82709851115942, + "step": 1080 + }, + { + "epoch": 1.0689061154177433, + "grad_norm": 0.2643815279006958, + "learning_rate": 1.6260476572399494e-05, + "loss": 0.5497, + "mean_token_accuracy": 0.825461483001709, + "step": 1085 + }, + { + "epoch": 1.073827980804725, + "grad_norm": 0.3040525019168854, + "learning_rate": 1.6215670178454893e-05, + "loss": 0.5478, + "mean_token_accuracy": 0.8264098614454269, + "step": 1090 + }, + { + "epoch": 1.0787498461917067, + "grad_norm": 0.28461357951164246, + "learning_rate": 1.6170659504721365e-05, + "loss": 0.5474, + "mean_token_accuracy": 0.8261038646101951, + "step": 1095 + }, + { + "epoch": 1.0836717115786882, + "grad_norm": 0.24723611772060394, + "learning_rate": 1.6125446030487642e-05, + "loss": 0.542, + "mean_token_accuracy": 0.8277976959943771, + "step": 1100 + }, + { + "epoch": 1.08859357696567, + "grad_norm": 0.4478602707386017, + "learning_rate": 1.608003124170758e-05, + "loss": 0.5435, + "mean_token_accuracy": 0.8271990329027176, + "step": 1105 + }, + { + "epoch": 1.0935154423526516, + "grad_norm": 0.2758786082267761, + "learning_rate": 1.6034416630951265e-05, + "loss": 0.5546, + "mean_token_accuracy": 0.8245001256465911, + "step": 1110 + }, + { + "epoch": 1.0984373077396332, + "grad_norm": 0.8616223335266113, + "learning_rate": 1.598860369735601e-05, + "loss": 0.5419, + "mean_token_accuracy": 0.827488873898983, + "step": 1115 + }, + { + "epoch": 1.103359173126615, + "grad_norm": 0.24690531194210052, + "learning_rate": 1.594259394657707e-05, + "loss": 0.5493, + "mean_token_accuracy": 0.8259517803788186, + "step": 1120 + }, + { + "epoch": 1.1082810385135966, + "grad_norm": 0.24601490795612335, + "learning_rate": 1.589638889073813e-05, + "loss": 0.5563, + "mean_token_accuracy": 0.8240275859832764, + "step": 1125 + }, + { + "epoch": 1.1132029039005784, + "grad_norm": 0.32801708579063416, + "learning_rate": 1.584999004838165e-05, + "loss": 0.5474, + "mean_token_accuracy": 0.8265691444277763, + "step": 1130 + }, + { + "epoch": 1.11812476928756, + "grad_norm": 0.25093355774879456, + "learning_rate": 1.5803398944418934e-05, + "loss": 0.5426, + "mean_token_accuracy": 0.8273544386029243, + "step": 1135 + }, + { + "epoch": 1.1230466346745416, + "grad_norm": 0.2600312829017639, + "learning_rate": 1.5756617110080023e-05, + "loss": 0.5522, + "mean_token_accuracy": 0.8249027922749519, + "step": 1140 + }, + { + "epoch": 1.1279685000615234, + "grad_norm": 0.26066142320632935, + "learning_rate": 1.570964608286336e-05, + "loss": 0.5442, + "mean_token_accuracy": 0.8270187392830849, + "step": 1145 + }, + { + "epoch": 1.132890365448505, + "grad_norm": 0.27738282084465027, + "learning_rate": 1.5662487406485273e-05, + "loss": 0.5361, + "mean_token_accuracy": 0.8295004799962044, + "step": 1150 + }, + { + "epoch": 1.1378122308354865, + "grad_norm": 0.3502300977706909, + "learning_rate": 1.561514263082923e-05, + "loss": 0.5482, + "mean_token_accuracy": 0.8256632193922997, + "step": 1155 + }, + { + "epoch": 1.1427340962224684, + "grad_norm": 0.5840310454368591, + "learning_rate": 1.5567613311894908e-05, + "loss": 0.5337, + "mean_token_accuracy": 0.8303180441260338, + "step": 1160 + }, + { + "epoch": 1.14765596160945, + "grad_norm": 0.2714439034461975, + "learning_rate": 1.5519901011747046e-05, + "loss": 0.5479, + "mean_token_accuracy": 0.8258592769503593, + "step": 1165 + }, + { + "epoch": 1.1525778269964317, + "grad_norm": 0.2692211866378784, + "learning_rate": 1.5472007298464117e-05, + "loss": 0.5439, + "mean_token_accuracy": 0.8271799921989441, + "step": 1170 + }, + { + "epoch": 1.1574996923834133, + "grad_norm": 0.2637535631656647, + "learning_rate": 1.5423933746086793e-05, + "loss": 0.5382, + "mean_token_accuracy": 0.8288466781377792, + "step": 1175 + }, + { + "epoch": 1.162421557770395, + "grad_norm": 0.25311315059661865, + "learning_rate": 1.5375681934566203e-05, + "loss": 0.5399, + "mean_token_accuracy": 0.8281501397490502, + "step": 1180 + }, + { + "epoch": 1.1673434231573767, + "grad_norm": 0.25321346521377563, + "learning_rate": 1.532725344971202e-05, + "loss": 0.5482, + "mean_token_accuracy": 0.8261646762490272, + "step": 1185 + }, + { + "epoch": 1.1722652885443583, + "grad_norm": 0.25499051809310913, + "learning_rate": 1.527864988314033e-05, + "loss": 0.5425, + "mean_token_accuracy": 0.8275581628084183, + "step": 1190 + }, + { + "epoch": 1.17718715393134, + "grad_norm": 0.2546637952327728, + "learning_rate": 1.5229872832221336e-05, + "loss": 0.5397, + "mean_token_accuracy": 0.8283757612109184, + "step": 1195 + }, + { + "epoch": 1.1821090193183217, + "grad_norm": 0.2738707363605499, + "learning_rate": 1.5180923900026847e-05, + "loss": 0.5386, + "mean_token_accuracy": 0.8282813474535942, + "step": 1200 + }, + { + "epoch": 1.1870308847053033, + "grad_norm": 0.2539266347885132, + "learning_rate": 1.5131804695277612e-05, + "loss": 0.5462, + "mean_token_accuracy": 0.826425202190876, + "step": 1205 + }, + { + "epoch": 1.1919527500922849, + "grad_norm": 0.2745126187801361, + "learning_rate": 1.5082516832290424e-05, + "loss": 0.5404, + "mean_token_accuracy": 0.8284027636051178, + "step": 1210 + }, + { + "epoch": 1.1968746154792667, + "grad_norm": 0.2544495165348053, + "learning_rate": 1.5033061930925081e-05, + "loss": 0.532, + "mean_token_accuracy": 0.8300672218203544, + "step": 1215 + }, + { + "epoch": 1.2017964808662482, + "grad_norm": 0.27299556136131287, + "learning_rate": 1.4983441616531152e-05, + "loss": 0.5396, + "mean_token_accuracy": 0.8280036672949791, + "step": 1220 + }, + { + "epoch": 1.20671834625323, + "grad_norm": 0.28981074690818787, + "learning_rate": 1.4933657519894542e-05, + "loss": 0.5524, + "mean_token_accuracy": 0.8247063636779786, + "step": 1225 + }, + { + "epoch": 1.2116402116402116, + "grad_norm": 0.30510908365249634, + "learning_rate": 1.4883711277183917e-05, + "loss": 0.5379, + "mean_token_accuracy": 0.8288484767079354, + "step": 1230 + }, + { + "epoch": 1.2165620770271932, + "grad_norm": 0.2616790533065796, + "learning_rate": 1.483360452989691e-05, + "loss": 0.5415, + "mean_token_accuracy": 0.8275775909423828, + "step": 1235 + }, + { + "epoch": 1.221483942414175, + "grad_norm": 0.2551945745944977, + "learning_rate": 1.4783338924806191e-05, + "loss": 0.5347, + "mean_token_accuracy": 0.8295770674943924, + "step": 1240 + }, + { + "epoch": 1.2264058078011566, + "grad_norm": 0.28227224946022034, + "learning_rate": 1.4732916113905336e-05, + "loss": 0.5425, + "mean_token_accuracy": 0.8273839592933655, + "step": 1245 + }, + { + "epoch": 1.2313276731881384, + "grad_norm": 0.260978102684021, + "learning_rate": 1.4682337754354534e-05, + "loss": 0.5431, + "mean_token_accuracy": 0.8270445480942726, + "step": 1250 + }, + { + "epoch": 1.23624953857512, + "grad_norm": 0.279462605714798, + "learning_rate": 1.4631605508426124e-05, + "loss": 0.5379, + "mean_token_accuracy": 0.828822860121727, + "step": 1255 + }, + { + "epoch": 1.2411714039621016, + "grad_norm": 0.2665978670120239, + "learning_rate": 1.4580721043449968e-05, + "loss": 0.5403, + "mean_token_accuracy": 0.8279185205698013, + "step": 1260 + }, + { + "epoch": 1.2460932693490834, + "grad_norm": 0.24216796457767487, + "learning_rate": 1.4529686031758642e-05, + "loss": 0.5409, + "mean_token_accuracy": 0.8280630350112915, + "step": 1265 + }, + { + "epoch": 1.251015134736065, + "grad_norm": 0.2504848837852478, + "learning_rate": 1.4478502150632503e-05, + "loss": 0.5389, + "mean_token_accuracy": 0.8282234400510788, + "step": 1270 + }, + { + "epoch": 1.2559370001230468, + "grad_norm": 0.25835323333740234, + "learning_rate": 1.4427171082244523e-05, + "loss": 0.5471, + "mean_token_accuracy": 0.8258385419845581, + "step": 1275 + }, + { + "epoch": 1.2608588655100283, + "grad_norm": 0.26074373722076416, + "learning_rate": 1.4375694513605037e-05, + "loss": 0.5413, + "mean_token_accuracy": 0.8273946106433868, + "step": 1280 + }, + { + "epoch": 1.26578073089701, + "grad_norm": 0.2714027762413025, + "learning_rate": 1.4324074136506283e-05, + "loss": 0.5399, + "mean_token_accuracy": 0.8278847292065621, + "step": 1285 + }, + { + "epoch": 1.2707025962839915, + "grad_norm": 0.24950872361660004, + "learning_rate": 1.427231164746681e-05, + "loss": 0.5429, + "mean_token_accuracy": 0.827368488907814, + "step": 1290 + }, + { + "epoch": 1.2756244616709733, + "grad_norm": 0.2415134608745575, + "learning_rate": 1.4220408747675714e-05, + "loss": 0.5417, + "mean_token_accuracy": 0.8275652229785919, + "step": 1295 + }, + { + "epoch": 1.280546327057955, + "grad_norm": 0.23719871044158936, + "learning_rate": 1.4168367142936736e-05, + "loss": 0.5442, + "mean_token_accuracy": 0.8268394738435745, + "step": 1300 + }, + { + "epoch": 1.2854681924449367, + "grad_norm": 0.2537670135498047, + "learning_rate": 1.4116188543612182e-05, + "loss": 0.5329, + "mean_token_accuracy": 0.8299818679690361, + "step": 1305 + }, + { + "epoch": 1.2903900578319183, + "grad_norm": 0.2709537446498871, + "learning_rate": 1.4063874664566734e-05, + "loss": 0.5419, + "mean_token_accuracy": 0.8275921046733856, + "step": 1310 + }, + { + "epoch": 1.2953119232188999, + "grad_norm": 0.26924365758895874, + "learning_rate": 1.4011427225111091e-05, + "loss": 0.5321, + "mean_token_accuracy": 0.8305203005671501, + "step": 1315 + }, + { + "epoch": 1.3002337886058817, + "grad_norm": 0.2832610607147217, + "learning_rate": 1.3958847948945428e-05, + "loss": 0.5391, + "mean_token_accuracy": 0.8282249644398689, + "step": 1320 + }, + { + "epoch": 1.3051556539928633, + "grad_norm": 0.2596539258956909, + "learning_rate": 1.3906138564102794e-05, + "loss": 0.5356, + "mean_token_accuracy": 0.829230573773384, + "step": 1325 + }, + { + "epoch": 1.310077519379845, + "grad_norm": 0.2699119448661804, + "learning_rate": 1.3853300802892285e-05, + "loss": 0.5417, + "mean_token_accuracy": 0.8279038980603218, + "step": 1330 + }, + { + "epoch": 1.3149993847668267, + "grad_norm": 0.2658538520336151, + "learning_rate": 1.380033640184213e-05, + "loss": 0.5462, + "mean_token_accuracy": 0.8260830625891685, + "step": 1335 + }, + { + "epoch": 1.3199212501538082, + "grad_norm": 0.25977060198783875, + "learning_rate": 1.3747247101642605e-05, + "loss": 0.5347, + "mean_token_accuracy": 0.8293716937303544, + "step": 1340 + }, + { + "epoch": 1.32484311554079, + "grad_norm": 0.24537616968154907, + "learning_rate": 1.369403464708884e-05, + "loss": 0.5367, + "mean_token_accuracy": 0.8292932540178299, + "step": 1345 + }, + { + "epoch": 1.3297649809277716, + "grad_norm": 0.2559899091720581, + "learning_rate": 1.3640700787023465e-05, + "loss": 0.5398, + "mean_token_accuracy": 0.8283236369490623, + "step": 1350 + }, + { + "epoch": 1.3346868463147534, + "grad_norm": 0.274198979139328, + "learning_rate": 1.358724727427914e-05, + "loss": 0.5376, + "mean_token_accuracy": 0.8286082163453102, + "step": 1355 + }, + { + "epoch": 1.339608711701735, + "grad_norm": 0.22712701559066772, + "learning_rate": 1.3533675865620937e-05, + "loss": 0.5336, + "mean_token_accuracy": 0.8294816762208939, + "step": 1360 + }, + { + "epoch": 1.3445305770887166, + "grad_norm": 0.24095574021339417, + "learning_rate": 1.3479988321688619e-05, + "loss": 0.536, + "mean_token_accuracy": 0.829172083735466, + "step": 1365 + }, + { + "epoch": 1.3494524424756982, + "grad_norm": 0.2448059618473053, + "learning_rate": 1.3426186406938769e-05, + "loss": 0.5337, + "mean_token_accuracy": 0.8295143947005272, + "step": 1370 + }, + { + "epoch": 1.35437430786268, + "grad_norm": 0.2575864791870117, + "learning_rate": 1.337227188958679e-05, + "loss": 0.5456, + "mean_token_accuracy": 0.8261685460805893, + "step": 1375 + }, + { + "epoch": 1.3592961732496616, + "grad_norm": 0.25145259499549866, + "learning_rate": 1.3318246541548812e-05, + "loss": 0.5319, + "mean_token_accuracy": 0.8304190933704376, + "step": 1380 + }, + { + "epoch": 1.3642180386366434, + "grad_norm": 0.2565249502658844, + "learning_rate": 1.3264112138383445e-05, + "loss": 0.5358, + "mean_token_accuracy": 0.8293601229786873, + "step": 1385 + }, + { + "epoch": 1.369139904023625, + "grad_norm": 0.8961818814277649, + "learning_rate": 1.3209870459233422e-05, + "loss": 0.528, + "mean_token_accuracy": 0.8313272252678872, + "step": 1390 + }, + { + "epoch": 1.3740617694106065, + "grad_norm": 0.26537856459617615, + "learning_rate": 1.315552328676714e-05, + "loss": 0.531, + "mean_token_accuracy": 0.8308784514665604, + "step": 1395 + }, + { + "epoch": 1.3789836347975883, + "grad_norm": 0.28985780477523804, + "learning_rate": 1.3101072407120056e-05, + "loss": 0.5406, + "mean_token_accuracy": 0.8277209624648094, + "step": 1400 + }, + { + "epoch": 1.38390550018457, + "grad_norm": 0.2510998249053955, + "learning_rate": 1.3046519609836002e-05, + "loss": 0.5406, + "mean_token_accuracy": 0.827545890212059, + "step": 1405 + }, + { + "epoch": 1.3888273655715517, + "grad_norm": 0.2563679814338684, + "learning_rate": 1.2991866687808355e-05, + "loss": 0.5394, + "mean_token_accuracy": 0.8279638543725014, + "step": 1410 + }, + { + "epoch": 1.3937492309585333, + "grad_norm": 0.2674863338470459, + "learning_rate": 1.2937115437221119e-05, + "loss": 0.547, + "mean_token_accuracy": 0.8261717170476913, + "step": 1415 + }, + { + "epoch": 1.398671096345515, + "grad_norm": 0.24103465676307678, + "learning_rate": 1.2882267657489908e-05, + "loss": 0.5428, + "mean_token_accuracy": 0.8272509336471557, + "step": 1420 + }, + { + "epoch": 1.4035929617324965, + "grad_norm": 0.22528545558452606, + "learning_rate": 1.2827325151202783e-05, + "loss": 0.5368, + "mean_token_accuracy": 0.8288370996713639, + "step": 1425 + }, + { + "epoch": 1.4085148271194783, + "grad_norm": 0.23950906097888947, + "learning_rate": 1.2772289724061015e-05, + "loss": 0.5309, + "mean_token_accuracy": 0.8302434518933296, + "step": 1430 + }, + { + "epoch": 1.4134366925064599, + "grad_norm": 0.22913850843906403, + "learning_rate": 1.2717163184819761e-05, + "loss": 0.5397, + "mean_token_accuracy": 0.8278713747859001, + "step": 1435 + }, + { + "epoch": 1.4183585578934417, + "grad_norm": 0.22565315663814545, + "learning_rate": 1.2661947345228593e-05, + "loss": 0.546, + "mean_token_accuracy": 0.826079449057579, + "step": 1440 + }, + { + "epoch": 1.4232804232804233, + "grad_norm": 0.2397647351026535, + "learning_rate": 1.2606644019971967e-05, + "loss": 0.5396, + "mean_token_accuracy": 0.8280595645308495, + "step": 1445 + }, + { + "epoch": 1.4282022886674048, + "grad_norm": 0.23136766254901886, + "learning_rate": 1.255125502660958e-05, + "loss": 0.5288, + "mean_token_accuracy": 0.8313645005226136, + "step": 1450 + }, + { + "epoch": 1.4331241540543866, + "grad_norm": 0.2330116331577301, + "learning_rate": 1.2495782185516638e-05, + "loss": 0.5364, + "mean_token_accuracy": 0.828608725965023, + "step": 1455 + }, + { + "epoch": 1.4380460194413682, + "grad_norm": 0.23435364663600922, + "learning_rate": 1.2440227319824024e-05, + "loss": 0.5323, + "mean_token_accuracy": 0.8299019247293472, + "step": 1460 + }, + { + "epoch": 1.44296788482835, + "grad_norm": 0.2517502009868622, + "learning_rate": 1.2384592255358385e-05, + "loss": 0.537, + "mean_token_accuracy": 0.8284672737121582, + "step": 1465 + }, + { + "epoch": 1.4478897502153316, + "grad_norm": 0.2454364001750946, + "learning_rate": 1.2328878820582122e-05, + "loss": 0.5282, + "mean_token_accuracy": 0.8314993128180503, + "step": 1470 + }, + { + "epoch": 1.4528116156023132, + "grad_norm": 0.2604913115501404, + "learning_rate": 1.2273088846533303e-05, + "loss": 0.5404, + "mean_token_accuracy": 0.8278495371341705, + "step": 1475 + }, + { + "epoch": 1.457733480989295, + "grad_norm": 0.277908593416214, + "learning_rate": 1.2217224166765478e-05, + "loss": 0.5285, + "mean_token_accuracy": 0.8310411602258683, + "step": 1480 + }, + { + "epoch": 1.4626553463762766, + "grad_norm": 0.23699437081813812, + "learning_rate": 1.216128661728742e-05, + "loss": 0.5359, + "mean_token_accuracy": 0.8288247928023338, + "step": 1485 + }, + { + "epoch": 1.4675772117632584, + "grad_norm": 0.2528901994228363, + "learning_rate": 1.2105278036502787e-05, + "loss": 0.543, + "mean_token_accuracy": 0.8267820864915848, + "step": 1490 + }, + { + "epoch": 1.47249907715024, + "grad_norm": 0.25504714250564575, + "learning_rate": 1.204920026514971e-05, + "loss": 0.5391, + "mean_token_accuracy": 0.8281295597553253, + "step": 1495 + }, + { + "epoch": 1.4774209425372216, + "grad_norm": 0.26783859729766846, + "learning_rate": 1.1993055146240273e-05, + "loss": 0.5325, + "mean_token_accuracy": 0.8299062862992287, + "step": 1500 + }, + { + "epoch": 1.4823428079242031, + "grad_norm": 0.25482243299484253, + "learning_rate": 1.1936844524999966e-05, + "loss": 0.5271, + "mean_token_accuracy": 0.8315476939082146, + "step": 1505 + }, + { + "epoch": 1.487264673311185, + "grad_norm": 0.2603563964366913, + "learning_rate": 1.1880570248807033e-05, + "loss": 0.5299, + "mean_token_accuracy": 0.8303808271884918, + "step": 1510 + }, + { + "epoch": 1.4921865386981665, + "grad_norm": 0.2345011830329895, + "learning_rate": 1.1824234167131748e-05, + "loss": 0.5274, + "mean_token_accuracy": 0.8310874328017235, + "step": 1515 + }, + { + "epoch": 1.4971084040851483, + "grad_norm": 0.3448658883571625, + "learning_rate": 1.1767838131475654e-05, + "loss": 0.5318, + "mean_token_accuracy": 0.8301808550953865, + "step": 1520 + }, + { + "epoch": 1.50203026947213, + "grad_norm": 0.26358914375305176, + "learning_rate": 1.171138399531068e-05, + "loss": 0.5341, + "mean_token_accuracy": 0.8296466439962387, + "step": 1525 + }, + { + "epoch": 1.5069521348591115, + "grad_norm": 0.23463788628578186, + "learning_rate": 1.1654873614018266e-05, + "loss": 0.5337, + "mean_token_accuracy": 0.8297147572040557, + "step": 1530 + }, + { + "epoch": 1.5118740002460933, + "grad_norm": 0.37559443712234497, + "learning_rate": 1.1598308844828348e-05, + "loss": 0.5281, + "mean_token_accuracy": 0.8311620846390724, + "step": 1535 + }, + { + "epoch": 1.516795865633075, + "grad_norm": 0.24298147857189178, + "learning_rate": 1.1541691546758343e-05, + "loss": 0.5353, + "mean_token_accuracy": 0.8288328930735588, + "step": 1540 + }, + { + "epoch": 1.5217177310200567, + "grad_norm": 0.2316361665725708, + "learning_rate": 1.1485023580552039e-05, + "loss": 0.5217, + "mean_token_accuracy": 0.8330785930156708, + "step": 1545 + }, + { + "epoch": 1.5266395964070383, + "grad_norm": 0.22819174826145172, + "learning_rate": 1.1428306808618456e-05, + "loss": 0.53, + "mean_token_accuracy": 0.8303656697273254, + "step": 1550 + }, + { + "epoch": 1.5315614617940199, + "grad_norm": 0.22326573729515076, + "learning_rate": 1.1371543094970624e-05, + "loss": 0.53, + "mean_token_accuracy": 0.8304451867938042, + "step": 1555 + }, + { + "epoch": 1.5364833271810014, + "grad_norm": 0.23267020285129547, + "learning_rate": 1.131473430516432e-05, + "loss": 0.5284, + "mean_token_accuracy": 0.8309284761548043, + "step": 1560 + }, + { + "epoch": 1.5414051925679833, + "grad_norm": 0.3377299904823303, + "learning_rate": 1.1257882306236776e-05, + "loss": 0.5336, + "mean_token_accuracy": 0.8295429393649101, + "step": 1565 + }, + { + "epoch": 1.546327057954965, + "grad_norm": 0.24768434464931488, + "learning_rate": 1.1200988966645286e-05, + "loss": 0.5326, + "mean_token_accuracy": 0.8297705203294754, + "step": 1570 + }, + { + "epoch": 1.5512489233419466, + "grad_norm": 0.22998486459255219, + "learning_rate": 1.1144056156205834e-05, + "loss": 0.5298, + "mean_token_accuracy": 0.8307420760393143, + "step": 1575 + }, + { + "epoch": 1.5561707887289282, + "grad_norm": 0.22251376509666443, + "learning_rate": 1.1087085746031612e-05, + "loss": 0.528, + "mean_token_accuracy": 0.8313020512461662, + "step": 1580 + }, + { + "epoch": 1.5610926541159098, + "grad_norm": 0.2297334372997284, + "learning_rate": 1.1030079608471544e-05, + "loss": 0.5335, + "mean_token_accuracy": 0.8294809475541115, + "step": 1585 + }, + { + "epoch": 1.5660145195028916, + "grad_norm": 0.23138615489006042, + "learning_rate": 1.0973039617048748e-05, + "loss": 0.5333, + "mean_token_accuracy": 0.829520358145237, + "step": 1590 + }, + { + "epoch": 1.5709363848898734, + "grad_norm": 0.23547935485839844, + "learning_rate": 1.091596764639895e-05, + "loss": 0.5267, + "mean_token_accuracy": 0.8314588502049446, + "step": 1595 + }, + { + "epoch": 1.575858250276855, + "grad_norm": 0.2409500926733017, + "learning_rate": 1.0858865572208892e-05, + "loss": 0.5346, + "mean_token_accuracy": 0.8291632473468781, + "step": 1600 + }, + { + "epoch": 1.5807801156638366, + "grad_norm": 0.2276252955198288, + "learning_rate": 1.080173527115467e-05, + "loss": 0.5273, + "mean_token_accuracy": 0.831089685857296, + "step": 1605 + }, + { + "epoch": 1.5857019810508182, + "grad_norm": 0.2589430809020996, + "learning_rate": 1.0744578620840065e-05, + "loss": 0.5388, + "mean_token_accuracy": 0.8279580160975456, + "step": 1610 + }, + { + "epoch": 1.5906238464378, + "grad_norm": 0.2499450445175171, + "learning_rate": 1.0687397499734842e-05, + "loss": 0.5268, + "mean_token_accuracy": 0.8311406090855599, + "step": 1615 + }, + { + "epoch": 1.5955457118247816, + "grad_norm": 0.2377663552761078, + "learning_rate": 1.0630193787112994e-05, + "loss": 0.5257, + "mean_token_accuracy": 0.8319837361574173, + "step": 1620 + }, + { + "epoch": 1.6004675772117634, + "grad_norm": 0.24260112643241882, + "learning_rate": 1.0572969362991e-05, + "loss": 0.5316, + "mean_token_accuracy": 0.8302173331379891, + "step": 1625 + }, + { + "epoch": 1.605389442598745, + "grad_norm": 1.525187611579895, + "learning_rate": 1.0515726108066025e-05, + "loss": 0.5315, + "mean_token_accuracy": 0.8299267381429672, + "step": 1630 + }, + { + "epoch": 1.6103113079857265, + "grad_norm": 0.23062676191329956, + "learning_rate": 1.0458465903654107e-05, + "loss": 0.5298, + "mean_token_accuracy": 0.8305988430976867, + "step": 1635 + }, + { + "epoch": 1.615233173372708, + "grad_norm": 0.23293638229370117, + "learning_rate": 1.0401190631628348e-05, + "loss": 0.5304, + "mean_token_accuracy": 0.8300972327589988, + "step": 1640 + }, + { + "epoch": 1.62015503875969, + "grad_norm": 0.22877627611160278, + "learning_rate": 1.034390217435704e-05, + "loss": 0.5287, + "mean_token_accuracy": 0.8309306666254997, + "step": 1645 + }, + { + "epoch": 1.6250769041466717, + "grad_norm": 0.23190174996852875, + "learning_rate": 1.0286602414641818e-05, + "loss": 0.5303, + "mean_token_accuracy": 0.8306381091475487, + "step": 1650 + }, + { + "epoch": 1.6299987695336533, + "grad_norm": 0.23290394246578217, + "learning_rate": 1.0229293235655768e-05, + "loss": 0.5221, + "mean_token_accuracy": 0.8326445773243905, + "step": 1655 + }, + { + "epoch": 1.6349206349206349, + "grad_norm": 0.22114625573158264, + "learning_rate": 1.0171976520881552e-05, + "loss": 0.5263, + "mean_token_accuracy": 0.8315576672554016, + "step": 1660 + }, + { + "epoch": 1.6398425003076165, + "grad_norm": 0.2297578752040863, + "learning_rate": 1.011465415404949e-05, + "loss": 0.5252, + "mean_token_accuracy": 0.8321317434310913, + "step": 1665 + }, + { + "epoch": 1.6447643656945983, + "grad_norm": 0.23588469624519348, + "learning_rate": 1.005732801907567e-05, + "loss": 0.5262, + "mean_token_accuracy": 0.831513050198555, + "step": 1670 + }, + { + "epoch": 1.64968623108158, + "grad_norm": 0.22704197466373444, + "learning_rate": 1e-05, + "loss": 0.5382, + "mean_token_accuracy": 0.8281245142221451, + "step": 1675 + }, + { + "epoch": 1.6546080964685617, + "grad_norm": 0.22588326036930084, + "learning_rate": 9.942671980924336e-06, + "loss": 0.5286, + "mean_token_accuracy": 0.8307414755225182, + "step": 1680 + }, + { + "epoch": 1.6595299618555432, + "grad_norm": 0.22511065006256104, + "learning_rate": 9.88534584595051e-06, + "loss": 0.5279, + "mean_token_accuracy": 0.83111013174057, + "step": 1685 + }, + { + "epoch": 1.6644518272425248, + "grad_norm": 0.24989110231399536, + "learning_rate": 9.82802347911845e-06, + "loss": 0.5257, + "mean_token_accuracy": 0.8317268043756485, + "step": 1690 + }, + { + "epoch": 1.6693736926295066, + "grad_norm": 0.23859356343746185, + "learning_rate": 9.770706764344235e-06, + "loss": 0.534, + "mean_token_accuracy": 0.8294050306081772, + "step": 1695 + }, + { + "epoch": 1.6742955580164882, + "grad_norm": 0.2304782122373581, + "learning_rate": 9.713397585358189e-06, + "loss": 0.528, + "mean_token_accuracy": 0.8308202102780342, + "step": 1700 + }, + { + "epoch": 1.67921742340347, + "grad_norm": 0.2276812344789505, + "learning_rate": 9.65609782564296e-06, + "loss": 0.5267, + "mean_token_accuracy": 0.8312249034643173, + "step": 1705 + }, + { + "epoch": 1.6841392887904516, + "grad_norm": 0.3979962170124054, + "learning_rate": 9.598809368371656e-06, + "loss": 0.5266, + "mean_token_accuracy": 0.8312003433704376, + "step": 1710 + }, + { + "epoch": 1.6890611541774332, + "grad_norm": 0.25581249594688416, + "learning_rate": 9.541534096345896e-06, + "loss": 0.526, + "mean_token_accuracy": 0.8315127685666084, + "step": 1715 + }, + { + "epoch": 1.6939830195644148, + "grad_norm": 0.2141893208026886, + "learning_rate": 9.484273891933982e-06, + "loss": 0.5252, + "mean_token_accuracy": 0.8317378848791123, + "step": 1720 + }, + { + "epoch": 1.6989048849513966, + "grad_norm": 0.4327445924282074, + "learning_rate": 9.427030637009002e-06, + "loss": 0.5361, + "mean_token_accuracy": 0.828312310576439, + "step": 1725 + }, + { + "epoch": 1.7038267503383784, + "grad_norm": 0.22412188351154327, + "learning_rate": 9.369806212887008e-06, + "loss": 0.5299, + "mean_token_accuracy": 0.830331552028656, + "step": 1730 + }, + { + "epoch": 1.70874861572536, + "grad_norm": 0.22056014835834503, + "learning_rate": 9.312602500265162e-06, + "loss": 0.5259, + "mean_token_accuracy": 0.831749576330185, + "step": 1735 + }, + { + "epoch": 1.7136704811123415, + "grad_norm": 0.23633216321468353, + "learning_rate": 9.255421379159935e-06, + "loss": 0.5152, + "mean_token_accuracy": 0.8346669390797615, + "step": 1740 + }, + { + "epoch": 1.7185923464993231, + "grad_norm": 0.21674410998821259, + "learning_rate": 9.198264728845332e-06, + "loss": 0.5188, + "mean_token_accuracy": 0.8335284858942031, + "step": 1745 + }, + { + "epoch": 1.723514211886305, + "grad_norm": 0.22083686292171478, + "learning_rate": 9.14113442779111e-06, + "loss": 0.5283, + "mean_token_accuracy": 0.8306051269173622, + "step": 1750 + }, + { + "epoch": 1.7284360772732867, + "grad_norm": 0.2326516956090927, + "learning_rate": 9.084032353601053e-06, + "loss": 0.5329, + "mean_token_accuracy": 0.8295654147863388, + "step": 1755 + }, + { + "epoch": 1.7333579426602683, + "grad_norm": 0.23140785098075867, + "learning_rate": 9.026960382951253e-06, + "loss": 0.5243, + "mean_token_accuracy": 0.8315977454185486, + "step": 1760 + }, + { + "epoch": 1.73827980804725, + "grad_norm": 0.24312028288841248, + "learning_rate": 8.969920391528459e-06, + "loss": 0.5218, + "mean_token_accuracy": 0.8328249961137771, + "step": 1765 + }, + { + "epoch": 1.7432016734342315, + "grad_norm": 0.22412382066249847, + "learning_rate": 8.912914253968391e-06, + "loss": 0.5312, + "mean_token_accuracy": 0.8298890963196754, + "step": 1770 + }, + { + "epoch": 1.748123538821213, + "grad_norm": 0.2266296148300171, + "learning_rate": 8.855943843794171e-06, + "loss": 0.5234, + "mean_token_accuracy": 0.8323718756437302, + "step": 1775 + }, + { + "epoch": 1.7530454042081949, + "grad_norm": 0.21898606419563293, + "learning_rate": 8.799011033354716e-06, + "loss": 0.5288, + "mean_token_accuracy": 0.8307971671223641, + "step": 1780 + }, + { + "epoch": 1.7579672695951767, + "grad_norm": 0.2306451052427292, + "learning_rate": 8.742117693763229e-06, + "loss": 0.5271, + "mean_token_accuracy": 0.8316369831562043, + "step": 1785 + }, + { + "epoch": 1.7628891349821583, + "grad_norm": 0.22924001514911652, + "learning_rate": 8.685265694835681e-06, + "loss": 0.5272, + "mean_token_accuracy": 0.8311286598443985, + "step": 1790 + }, + { + "epoch": 1.7678110003691399, + "grad_norm": 0.33131736516952515, + "learning_rate": 8.628456905029383e-06, + "loss": 0.5195, + "mean_token_accuracy": 0.833528995513916, + "step": 1795 + }, + { + "epoch": 1.7727328657561214, + "grad_norm": 0.24447475373744965, + "learning_rate": 8.571693191381545e-06, + "loss": 0.5221, + "mean_token_accuracy": 0.8324113413691521, + "step": 1800 + }, + { + "epoch": 1.7776547311431032, + "grad_norm": 0.23472720384597778, + "learning_rate": 8.514976419447963e-06, + "loss": 0.5282, + "mean_token_accuracy": 0.8306461483240127, + "step": 1805 + }, + { + "epoch": 1.782576596530085, + "grad_norm": 0.25232747197151184, + "learning_rate": 8.458308453241664e-06, + "loss": 0.519, + "mean_token_accuracy": 0.8334705844521523, + "step": 1810 + }, + { + "epoch": 1.7874984619170666, + "grad_norm": 0.22827033698558807, + "learning_rate": 8.401691155171654e-06, + "loss": 0.5353, + "mean_token_accuracy": 0.8289692014455795, + "step": 1815 + }, + { + "epoch": 1.7924203273040482, + "grad_norm": 0.21775387227535248, + "learning_rate": 8.345126385981737e-06, + "loss": 0.5217, + "mean_token_accuracy": 0.8326601728796958, + "step": 1820 + }, + { + "epoch": 1.7973421926910298, + "grad_norm": 0.22691109776496887, + "learning_rate": 8.288616004689321e-06, + "loss": 0.5208, + "mean_token_accuracy": 0.8330274626612664, + "step": 1825 + }, + { + "epoch": 1.8022640580780116, + "grad_norm": 0.23031188547611237, + "learning_rate": 8.23216186852435e-06, + "loss": 0.5251, + "mean_token_accuracy": 0.8317318856716156, + "step": 1830 + }, + { + "epoch": 1.8071859234649932, + "grad_norm": 0.23658455908298492, + "learning_rate": 8.175765832868252e-06, + "loss": 0.5263, + "mean_token_accuracy": 0.8314035385847092, + "step": 1835 + }, + { + "epoch": 1.812107788851975, + "grad_norm": 0.21728812158107758, + "learning_rate": 8.119429751192972e-06, + "loss": 0.5283, + "mean_token_accuracy": 0.830833038687706, + "step": 1840 + }, + { + "epoch": 1.8170296542389566, + "grad_norm": 0.22863180935382843, + "learning_rate": 8.063155475000037e-06, + "loss": 0.5231, + "mean_token_accuracy": 0.8322245612740516, + "step": 1845 + }, + { + "epoch": 1.8219515196259382, + "grad_norm": 0.22922097146511078, + "learning_rate": 8.006944853759732e-06, + "loss": 0.5242, + "mean_token_accuracy": 0.8318595319986344, + "step": 1850 + }, + { + "epoch": 1.8268733850129197, + "grad_norm": 0.209337517619133, + "learning_rate": 7.950799734850292e-06, + "loss": 0.5195, + "mean_token_accuracy": 0.8333837404847145, + "step": 1855 + }, + { + "epoch": 1.8317952503999015, + "grad_norm": 0.22603721916675568, + "learning_rate": 7.894721963497214e-06, + "loss": 0.5218, + "mean_token_accuracy": 0.8325009673833847, + "step": 1860 + }, + { + "epoch": 1.8367171157868833, + "grad_norm": 0.2327803522348404, + "learning_rate": 7.838713382712583e-06, + "loss": 0.5111, + "mean_token_accuracy": 0.8357574358582497, + "step": 1865 + }, + { + "epoch": 1.841638981173865, + "grad_norm": 0.23280593752861023, + "learning_rate": 7.782775833234522e-06, + "loss": 0.5333, + "mean_token_accuracy": 0.8295109212398529, + "step": 1870 + }, + { + "epoch": 1.8465608465608465, + "grad_norm": 0.2219589352607727, + "learning_rate": 7.726911153466699e-06, + "loss": 0.5255, + "mean_token_accuracy": 0.8316129177808762, + "step": 1875 + }, + { + "epoch": 1.851482711947828, + "grad_norm": 0.22274133563041687, + "learning_rate": 7.67112117941788e-06, + "loss": 0.5197, + "mean_token_accuracy": 0.8331713795661926, + "step": 1880 + }, + { + "epoch": 1.85640457733481, + "grad_norm": 0.20765641331672668, + "learning_rate": 7.615407744641618e-06, + "loss": 0.5222, + "mean_token_accuracy": 0.8323680445551872, + "step": 1885 + }, + { + "epoch": 1.8613264427217917, + "grad_norm": 0.22262942790985107, + "learning_rate": 7.559772680175979e-06, + "loss": 0.5256, + "mean_token_accuracy": 0.8315785735845566, + "step": 1890 + }, + { + "epoch": 1.8662483081087733, + "grad_norm": 0.23786763846874237, + "learning_rate": 7.504217814483364e-06, + "loss": 0.5225, + "mean_token_accuracy": 0.8326525434851646, + "step": 1895 + }, + { + "epoch": 1.8711701734957549, + "grad_norm": 0.22120903432369232, + "learning_rate": 7.448744973390423e-06, + "loss": 0.5322, + "mean_token_accuracy": 0.8296578034758568, + "step": 1900 + }, + { + "epoch": 1.8760920388827365, + "grad_norm": 0.22359086573123932, + "learning_rate": 7.393355980028039e-06, + "loss": 0.524, + "mean_token_accuracy": 0.8320103421807289, + "step": 1905 + }, + { + "epoch": 1.8810139042697183, + "grad_norm": 0.21293464303016663, + "learning_rate": 7.338052654771407e-06, + "loss": 0.5201, + "mean_token_accuracy": 0.8330625906586647, + "step": 1910 + }, + { + "epoch": 1.8859357696566998, + "grad_norm": 0.212773397564888, + "learning_rate": 7.282836815180241e-06, + "loss": 0.5212, + "mean_token_accuracy": 0.8328917175531387, + "step": 1915 + }, + { + "epoch": 1.8908576350436817, + "grad_norm": 0.2229495495557785, + "learning_rate": 7.227710275938987e-06, + "loss": 0.5177, + "mean_token_accuracy": 0.8338592052459717, + "step": 1920 + }, + { + "epoch": 1.8957795004306632, + "grad_norm": 0.22714777290821075, + "learning_rate": 7.172674848797218e-06, + "loss": 0.5196, + "mean_token_accuracy": 0.8332103446125985, + "step": 1925 + }, + { + "epoch": 1.9007013658176448, + "grad_norm": 0.5862542986869812, + "learning_rate": 7.117732342510093e-06, + "loss": 0.5148, + "mean_token_accuracy": 0.8348309084773063, + "step": 1930 + }, + { + "epoch": 1.9056232312046264, + "grad_norm": 0.21524302661418915, + "learning_rate": 7.062884562778883e-06, + "loss": 0.5225, + "mean_token_accuracy": 0.8324376299977303, + "step": 1935 + }, + { + "epoch": 1.9105450965916082, + "grad_norm": 0.22445465624332428, + "learning_rate": 7.008133312191649e-06, + "loss": 0.5239, + "mean_token_accuracy": 0.8318991348147392, + "step": 1940 + }, + { + "epoch": 1.91546696197859, + "grad_norm": 0.21925503015518188, + "learning_rate": 6.953480390164001e-06, + "loss": 0.5243, + "mean_token_accuracy": 0.8320589557290077, + "step": 1945 + }, + { + "epoch": 1.9203888273655716, + "grad_norm": 0.21358764171600342, + "learning_rate": 6.898927592879945e-06, + "loss": 0.5276, + "mean_token_accuracy": 0.8309697136282921, + "step": 1950 + }, + { + "epoch": 1.9253106927525532, + "grad_norm": 0.21541139483451843, + "learning_rate": 6.844476713232863e-06, + "loss": 0.5183, + "mean_token_accuracy": 0.8336074352264404, + "step": 1955 + }, + { + "epoch": 1.9302325581395348, + "grad_norm": 0.253334105014801, + "learning_rate": 6.790129540766581e-06, + "loss": 0.5217, + "mean_token_accuracy": 0.8321399599313736, + "step": 1960 + }, + { + "epoch": 1.9351544235265166, + "grad_norm": 0.2311272770166397, + "learning_rate": 6.735887861616555e-06, + "loss": 0.5226, + "mean_token_accuracy": 0.832192762196064, + "step": 1965 + }, + { + "epoch": 1.9400762889134984, + "grad_norm": 0.2155195027589798, + "learning_rate": 6.68175345845119e-06, + "loss": 0.5214, + "mean_token_accuracy": 0.8325791984796524, + "step": 1970 + }, + { + "epoch": 1.94499815430048, + "grad_norm": 0.2229234129190445, + "learning_rate": 6.627728110413214e-06, + "loss": 0.5228, + "mean_token_accuracy": 0.8320748254656791, + "step": 1975 + }, + { + "epoch": 1.9499200196874615, + "grad_norm": 0.2595667839050293, + "learning_rate": 6.5738135930612355e-06, + "loss": 0.5257, + "mean_token_accuracy": 0.831524421274662, + "step": 1980 + }, + { + "epoch": 1.9548418850744431, + "grad_norm": 0.21894799172878265, + "learning_rate": 6.520011678311382e-06, + "loss": 0.5135, + "mean_token_accuracy": 0.8349313631653785, + "step": 1985 + }, + { + "epoch": 1.959763750461425, + "grad_norm": 0.215131938457489, + "learning_rate": 6.466324134379066e-06, + "loss": 0.5125, + "mean_token_accuracy": 0.8354373678565026, + "step": 1990 + }, + { + "epoch": 1.9646856158484065, + "grad_norm": 0.227864071726799, + "learning_rate": 6.412752725720864e-06, + "loss": 0.5166, + "mean_token_accuracy": 0.8339696109294892, + "step": 1995 + }, + { + "epoch": 1.9696074812353883, + "grad_norm": 0.21633465588092804, + "learning_rate": 6.359299212976535e-06, + "loss": 0.5236, + "mean_token_accuracy": 0.8324458003044128, + "step": 2000 + }, + { + "epoch": 1.97452934662237, + "grad_norm": 0.2214214950799942, + "learning_rate": 6.305965352911162e-06, + "loss": 0.5186, + "mean_token_accuracy": 0.8334563329815865, + "step": 2005 + }, + { + "epoch": 1.9794512120093515, + "grad_norm": 0.20772044360637665, + "learning_rate": 6.252752898357397e-06, + "loss": 0.5146, + "mean_token_accuracy": 0.8346970349550247, + "step": 2010 + }, + { + "epoch": 1.984373077396333, + "grad_norm": 0.2208469659090042, + "learning_rate": 6.1996635981578755e-06, + "loss": 0.521, + "mean_token_accuracy": 0.8330862745642662, + "step": 2015 + }, + { + "epoch": 1.9892949427833149, + "grad_norm": 0.21841764450073242, + "learning_rate": 6.146699197107715e-06, + "loss": 0.5141, + "mean_token_accuracy": 0.8346462666988372, + "step": 2020 + }, + { + "epoch": 1.9942168081702967, + "grad_norm": 0.22905802726745605, + "learning_rate": 6.093861435897208e-06, + "loss": 0.5161, + "mean_token_accuracy": 0.8341751024127007, + "step": 2025 + }, + { + "epoch": 1.9991386735572783, + "grad_norm": 0.2205893099308014, + "learning_rate": 6.041152051054575e-06, + "loss": 0.5135, + "mean_token_accuracy": 0.8350084885954857, + "step": 2030 + }, + { + "epoch": 2.0049218653869816, + "grad_norm": 0.27798768877983093, + "learning_rate": 5.988572774888913e-06, + "loss": 0.5979, + "mean_token_accuracy": 0.8386082910909886, + "step": 2035 + }, + { + "epoch": 2.009843730773963, + "grad_norm": 0.24996507167816162, + "learning_rate": 5.936125335433265e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.839720045030117, + "step": 2040 + }, + { + "epoch": 2.014765596160945, + "grad_norm": 0.2548527121543884, + "learning_rate": 5.883811456387821e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.8400543674826622, + "step": 2045 + }, + { + "epoch": 2.0196874615479268, + "grad_norm": 0.2184976190328598, + "learning_rate": 5.831632857063271e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8409830510616303, + "step": 2050 + }, + { + "epoch": 2.0246093269349084, + "grad_norm": 0.22762830555438995, + "learning_rate": 5.779591252324286e-06, + "loss": 0.4904, + "mean_token_accuracy": 0.8408440828323365, + "step": 2055 + }, + { + "epoch": 2.02953119232189, + "grad_norm": 0.23035886883735657, + "learning_rate": 5.7276883525331915e-06, + "loss": 0.4943, + "mean_token_accuracy": 0.8397367835044861, + "step": 2060 + }, + { + "epoch": 2.0344530577088715, + "grad_norm": 0.22349004447460175, + "learning_rate": 5.675925863493721e-06, + "loss": 0.5009, + "mean_token_accuracy": 0.8379953891038895, + "step": 2065 + }, + { + "epoch": 2.0393749230958536, + "grad_norm": 0.22588923573493958, + "learning_rate": 5.6243054863949675e-06, + "loss": 0.494, + "mean_token_accuracy": 0.8397265374660492, + "step": 2070 + }, + { + "epoch": 2.044296788482835, + "grad_norm": 0.2168150246143341, + "learning_rate": 5.5728289177554805e-06, + "loss": 0.4975, + "mean_token_accuracy": 0.8389487206935883, + "step": 2075 + }, + { + "epoch": 2.0492186538698167, + "grad_norm": 0.22331282496452332, + "learning_rate": 5.521497849367501e-06, + "loss": 0.4859, + "mean_token_accuracy": 0.8422671511769295, + "step": 2080 + }, + { + "epoch": 2.0541405192567983, + "grad_norm": 0.21221551299095154, + "learning_rate": 5.4703139682413585e-06, + "loss": 0.4866, + "mean_token_accuracy": 0.8420242533087731, + "step": 2085 + }, + { + "epoch": 2.05906238464378, + "grad_norm": 0.22058208286762238, + "learning_rate": 5.419278956550037e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.8394055813550949, + "step": 2090 + }, + { + "epoch": 2.0639842500307615, + "grad_norm": 0.22200560569763184, + "learning_rate": 5.368394491573876e-06, + "loss": 0.493, + "mean_token_accuracy": 0.8402127623558044, + "step": 2095 + }, + { + "epoch": 2.0689061154177435, + "grad_norm": 0.2220141738653183, + "learning_rate": 5.31766224564547e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.8393116250634194, + "step": 2100 + }, + { + "epoch": 2.073827980804725, + "grad_norm": 0.21074913442134857, + "learning_rate": 5.267083886094668e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.840206652879715, + "step": 2105 + }, + { + "epoch": 2.0787498461917067, + "grad_norm": 0.2276320606470108, + "learning_rate": 5.216661075193814e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.8393134921789169, + "step": 2110 + }, + { + "epoch": 2.0836717115786882, + "grad_norm": 0.2224099338054657, + "learning_rate": 5.166395470103092e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.8397904768586159, + "step": 2115 + }, + { + "epoch": 2.08859357696567, + "grad_norm": 0.22312206029891968, + "learning_rate": 5.116288722816087e-06, + "loss": 0.493, + "mean_token_accuracy": 0.8403119757771492, + "step": 2120 + }, + { + "epoch": 2.093515442352652, + "grad_norm": 0.2194313257932663, + "learning_rate": 5.06634248010546e-06, + "loss": 0.4935, + "mean_token_accuracy": 0.8400413483381272, + "step": 2125 + }, + { + "epoch": 2.0984373077396334, + "grad_norm": 0.22484691441059113, + "learning_rate": 5.016558383468851e-06, + "loss": 0.49, + "mean_token_accuracy": 0.8409391462802887, + "step": 2130 + }, + { + "epoch": 2.103359173126615, + "grad_norm": 0.22470517456531525, + "learning_rate": 4.9669380690749215e-06, + "loss": 0.497, + "mean_token_accuracy": 0.8389460816979408, + "step": 2135 + }, + { + "epoch": 2.1082810385135966, + "grad_norm": 0.21832752227783203, + "learning_rate": 4.91748316770958e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.8401527449488639, + "step": 2140 + }, + { + "epoch": 2.113202903900578, + "grad_norm": 0.21521726250648499, + "learning_rate": 4.868195304722391e-06, + "loss": 0.4979, + "mean_token_accuracy": 0.8387278065085411, + "step": 2145 + }, + { + "epoch": 2.1181247692875598, + "grad_norm": 0.21682803332805634, + "learning_rate": 4.819076099973152e-06, + "loss": 0.5014, + "mean_token_accuracy": 0.83763497620821, + "step": 2150 + }, + { + "epoch": 2.123046634674542, + "grad_norm": 0.2204725295305252, + "learning_rate": 4.77012716777867e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.8380599915981293, + "step": 2155 + }, + { + "epoch": 2.1279685000615234, + "grad_norm": 0.2179991751909256, + "learning_rate": 4.721350116859675e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.8396460056304932, + "step": 2160 + }, + { + "epoch": 2.132890365448505, + "grad_norm": 0.21851445734500885, + "learning_rate": 4.672746550287985e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.8395410850644112, + "step": 2165 + }, + { + "epoch": 2.1378122308354865, + "grad_norm": 0.21560297906398773, + "learning_rate": 4.6243180654337975e-06, + "loss": 0.4857, + "mean_token_accuracy": 0.8421663656830788, + "step": 2170 + }, + { + "epoch": 2.142734096222468, + "grad_norm": 0.21567942202091217, + "learning_rate": 4.576066253913209e-06, + "loss": 0.493, + "mean_token_accuracy": 0.840301775932312, + "step": 2175 + }, + { + "epoch": 2.14765596160945, + "grad_norm": 0.22145864367485046, + "learning_rate": 4.527992701535884e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8423144072294235, + "step": 2180 + }, + { + "epoch": 2.1525778269964317, + "grad_norm": 0.217710942029953, + "learning_rate": 4.480098988252958e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.84017314016819, + "step": 2185 + }, + { + "epoch": 2.1574996923834133, + "grad_norm": 0.2169259786605835, + "learning_rate": 4.432386688105095e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.840173925459385, + "step": 2190 + }, + { + "epoch": 2.162421557770395, + "grad_norm": 0.21104402840137482, + "learning_rate": 4.384857369170772e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.8417868033051491, + "step": 2195 + }, + { + "epoch": 2.1673434231573765, + "grad_norm": 0.21658702194690704, + "learning_rate": 4.337512593514729e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.8395476669073105, + "step": 2200 + }, + { + "epoch": 2.1722652885443585, + "grad_norm": 0.22858913242816925, + "learning_rate": 4.290353917136639e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.8408517464995384, + "step": 2205 + }, + { + "epoch": 2.17718715393134, + "grad_norm": 0.4094144105911255, + "learning_rate": 4.243382889919981e-06, + "loss": 0.496, + "mean_token_accuracy": 0.8392629832029342, + "step": 2210 + }, + { + "epoch": 2.1821090193183217, + "grad_norm": 0.21924547851085663, + "learning_rate": 4.1966010555810696e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.841227824985981, + "step": 2215 + }, + { + "epoch": 2.1870308847053033, + "grad_norm": 0.21283064782619476, + "learning_rate": 4.1500099516183555e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.8405321702361107, + "step": 2220 + }, + { + "epoch": 2.191952750092285, + "grad_norm": 0.21150268614292145, + "learning_rate": 4.1036111092618725e-06, + "loss": 0.4895, + "mean_token_accuracy": 0.8410715743899345, + "step": 2225 + }, + { + "epoch": 2.1968746154792664, + "grad_norm": 0.20887652039527893, + "learning_rate": 4.057406053422933e-06, + "loss": 0.4935, + "mean_token_accuracy": 0.8398977249860764, + "step": 2230 + }, + { + "epoch": 2.2017964808662485, + "grad_norm": 0.20756816864013672, + "learning_rate": 4.011396302643989e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.842858923971653, + "step": 2235 + }, + { + "epoch": 2.20671834625323, + "grad_norm": 0.23419924080371857, + "learning_rate": 3.965583369048737e-06, + "loss": 0.4963, + "mean_token_accuracy": 0.8392103880643844, + "step": 2240 + }, + { + "epoch": 2.2116402116402116, + "grad_norm": 0.21532607078552246, + "learning_rate": 3.919968758292425e-06, + "loss": 0.4883, + "mean_token_accuracy": 0.8413224458694458, + "step": 2245 + }, + { + "epoch": 2.216562077027193, + "grad_norm": 0.2164084017276764, + "learning_rate": 3.874553969512358e-06, + "loss": 0.4885, + "mean_token_accuracy": 0.8415488794445991, + "step": 2250 + }, + { + "epoch": 2.221483942414175, + "grad_norm": 0.21010589599609375, + "learning_rate": 3.82934049527864e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.8404750242829323, + "step": 2255 + }, + { + "epoch": 2.226405807801157, + "grad_norm": 0.20962242782115936, + "learning_rate": 3.784329821545105e-06, + "loss": 0.4962, + "mean_token_accuracy": 0.839095975458622, + "step": 2260 + }, + { + "epoch": 2.2313276731881384, + "grad_norm": 0.20551133155822754, + "learning_rate": 3.739523427600509e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.8407798200845719, + "step": 2265 + }, + { + "epoch": 2.23624953857512, + "grad_norm": 0.21332746744155884, + "learning_rate": 3.6949227860198712e-06, + "loss": 0.492, + "mean_token_accuracy": 0.8405194252729415, + "step": 2270 + }, + { + "epoch": 2.2411714039621016, + "grad_norm": 0.26087722182273865, + "learning_rate": 3.650529362616113e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.8417001351714134, + "step": 2275 + }, + { + "epoch": 2.246093269349083, + "grad_norm": 0.20974403619766235, + "learning_rate": 3.606344616391867e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.8395893201231956, + "step": 2280 + }, + { + "epoch": 2.2510151347360647, + "grad_norm": 0.22249352931976318, + "learning_rate": 3.5623699994915363e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.840800578892231, + "step": 2285 + }, + { + "epoch": 2.2559370001230468, + "grad_norm": 0.20673160254955292, + "learning_rate": 3.5186069571535575e-06, + "loss": 0.4876, + "mean_token_accuracy": 0.8417642295360566, + "step": 2290 + }, + { + "epoch": 2.2608588655100283, + "grad_norm": 0.2050849050283432, + "learning_rate": 3.475056927662912e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.8401932448148728, + "step": 2295 + }, + { + "epoch": 2.26578073089701, + "grad_norm": 0.2113514542579651, + "learning_rate": 3.4317213423038386e-06, + "loss": 0.4925, + "mean_token_accuracy": 0.8401719897985458, + "step": 2300 + }, + { + "epoch": 2.2707025962839915, + "grad_norm": 0.21461407840251923, + "learning_rate": 3.388601625312833e-06, + "loss": 0.4892, + "mean_token_accuracy": 0.841229310631752, + "step": 2305 + }, + { + "epoch": 2.275624461670973, + "grad_norm": 0.20549601316452026, + "learning_rate": 3.345699193831795e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.8405207619071007, + "step": 2310 + }, + { + "epoch": 2.280546327057955, + "grad_norm": 0.21262629330158234, + "learning_rate": 3.3030154578614783e-06, + "loss": 0.4898, + "mean_token_accuracy": 0.8410497605800629, + "step": 2315 + }, + { + "epoch": 2.2854681924449367, + "grad_norm": 0.2351827323436737, + "learning_rate": 3.2605518202151577e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.8394208237528801, + "step": 2320 + }, + { + "epoch": 2.2903900578319183, + "grad_norm": 0.21704116463661194, + "learning_rate": 3.218309676472492e-06, + "loss": 0.489, + "mean_token_accuracy": 0.8411409676074981, + "step": 2325 + }, + { + "epoch": 2.2953119232189, + "grad_norm": 0.20750364661216736, + "learning_rate": 3.1762904149336947e-06, + "loss": 0.4942, + "mean_token_accuracy": 0.8396940395236016, + "step": 2330 + }, + { + "epoch": 2.3002337886058815, + "grad_norm": 0.20055250823497772, + "learning_rate": 3.134495416573884e-06, + "loss": 0.4871, + "mean_token_accuracy": 0.8417407006025315, + "step": 2335 + }, + { + "epoch": 2.3051556539928635, + "grad_norm": 0.20621967315673828, + "learning_rate": 3.0929260549977116e-06, + "loss": 0.4883, + "mean_token_accuracy": 0.8415425732731819, + "step": 2340 + }, + { + "epoch": 2.310077519379845, + "grad_norm": 0.210305854678154, + "learning_rate": 3.0515836963942056e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.8403278931975364, + "step": 2345 + }, + { + "epoch": 2.3149993847668267, + "grad_norm": 0.25147390365600586, + "learning_rate": 3.01046969949188e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.8407050803303718, + "step": 2350 + }, + { + "epoch": 2.3199212501538082, + "grad_norm": 0.21020571887493134, + "learning_rate": 2.9695854155140648e-06, + "loss": 0.4895, + "mean_token_accuracy": 0.8410211369395256, + "step": 2355 + }, + { + "epoch": 2.32484311554079, + "grad_norm": 0.21094508469104767, + "learning_rate": 2.9289321881345257e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.841056476533413, + "step": 2360 + }, + { + "epoch": 2.329764980927772, + "grad_norm": 0.21813294291496277, + "learning_rate": 2.8885113534332742e-06, + "loss": 0.4928, + "mean_token_accuracy": 0.8402146637439728, + "step": 2365 + }, + { + "epoch": 2.3346868463147534, + "grad_norm": 0.21038471162319183, + "learning_rate": 2.8483242398526723e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.8416903391480446, + "step": 2370 + }, + { + "epoch": 2.339608711701735, + "grad_norm": 0.21476763486862183, + "learning_rate": 2.80837216815378e-06, + "loss": 0.4883, + "mean_token_accuracy": 0.8410104081034661, + "step": 2375 + }, + { + "epoch": 2.3445305770887166, + "grad_norm": 0.2148827761411667, + "learning_rate": 2.7686564513729198e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.8401752710342407, + "step": 2380 + }, + { + "epoch": 2.349452442475698, + "grad_norm": 0.20347550511360168, + "learning_rate": 2.7291783947785544e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.841368468105793, + "step": 2385 + }, + { + "epoch": 2.35437430786268, + "grad_norm": 0.2156437486410141, + "learning_rate": 2.689939295828371e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.8401880413293839, + "step": 2390 + }, + { + "epoch": 2.359296173249662, + "grad_norm": 0.20905110239982605, + "learning_rate": 2.650940444126654e-06, + "loss": 0.4915, + "mean_token_accuracy": 0.8407162860035896, + "step": 2395 + }, + { + "epoch": 2.3642180386366434, + "grad_norm": 0.20476758480072021, + "learning_rate": 2.6121831213818825e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.840287271142006, + "step": 2400 + }, + { + "epoch": 2.369139904023625, + "grad_norm": 0.1986178457736969, + "learning_rate": 2.5736686013646226e-06, + "loss": 0.4857, + "mean_token_accuracy": 0.8420573100447655, + "step": 2405 + }, + { + "epoch": 2.3740617694106065, + "grad_norm": 0.21784992516040802, + "learning_rate": 2.535398149865651e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.8410965353250504, + "step": 2410 + }, + { + "epoch": 2.378983634797588, + "grad_norm": 0.20018485188484192, + "learning_rate": 2.4973730246543736e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.8406006515026092, + "step": 2415 + }, + { + "epoch": 2.3839055001845697, + "grad_norm": 0.21187762916088104, + "learning_rate": 2.4595944754374723e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.8388384222984314, + "step": 2420 + }, + { + "epoch": 2.3888273655715517, + "grad_norm": 0.2048918604850769, + "learning_rate": 2.422063743817832e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.8397043973207474, + "step": 2425 + }, + { + "epoch": 2.3937492309585333, + "grad_norm": 0.2068692445755005, + "learning_rate": 2.3847820632537565e-06, + "loss": 0.4973, + "mean_token_accuracy": 0.8392092302441597, + "step": 2430 + }, + { + "epoch": 2.398671096345515, + "grad_norm": 0.2050062119960785, + "learning_rate": 2.347750659018397e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.8390960440039634, + "step": 2435 + }, + { + "epoch": 2.4035929617324965, + "grad_norm": 0.20241810381412506, + "learning_rate": 2.3109707481595113e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8431760326027871, + "step": 2440 + }, + { + "epoch": 2.408514827119478, + "grad_norm": 0.2023165076971054, + "learning_rate": 2.27444353945945e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8427256375551224, + "step": 2445 + }, + { + "epoch": 2.41343669250646, + "grad_norm": 0.2395012527704239, + "learning_rate": 2.2381702333954436e-06, + "loss": 0.4843, + "mean_token_accuracy": 0.8425970792770385, + "step": 2450 + }, + { + "epoch": 2.4183585578934417, + "grad_norm": 0.20210982859134674, + "learning_rate": 2.2021520221001304e-06, + "loss": 0.488, + "mean_token_accuracy": 0.8415813356637954, + "step": 2455 + }, + { + "epoch": 2.4232804232804233, + "grad_norm": 0.2082945853471756, + "learning_rate": 2.16639008932239e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.8398790895938874, + "step": 2460 + }, + { + "epoch": 2.428202288667405, + "grad_norm": 0.20752127468585968, + "learning_rate": 2.130885610388428e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.839399340748787, + "step": 2465 + }, + { + "epoch": 2.4331241540543864, + "grad_norm": 0.20869506895542145, + "learning_rate": 2.0956397521631666e-06, + "loss": 0.4868, + "mean_token_accuracy": 0.8415920332074165, + "step": 2470 + }, + { + "epoch": 2.4380460194413685, + "grad_norm": 0.20477741956710815, + "learning_rate": 2.0606536730118767e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.8429039210081101, + "step": 2475 + }, + { + "epoch": 2.44296788482835, + "grad_norm": 0.20474423468112946, + "learning_rate": 2.0259285227621152e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.8382045805454255, + "step": 2480 + }, + { + "epoch": 2.4478897502153316, + "grad_norm": 0.20369385182857513, + "learning_rate": 1.9914654426659374e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.839960803091526, + "step": 2485 + }, + { + "epoch": 2.452811615602313, + "grad_norm": 0.2068207710981369, + "learning_rate": 1.9572655653623884e-06, + "loss": 0.4935, + "mean_token_accuracy": 0.8397150009870529, + "step": 2490 + }, + { + "epoch": 2.457733480989295, + "grad_norm": 0.20661979913711548, + "learning_rate": 1.9233300148402767e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.8401017665863038, + "step": 2495 + }, + { + "epoch": 2.462655346376277, + "grad_norm": 0.21355277299880981, + "learning_rate": 1.88965990640123e-06, + "loss": 0.487, + "mean_token_accuracy": 0.8420075699687004, + "step": 2500 + }, + { + "epoch": 2.4675772117632584, + "grad_norm": 0.209817573428154, + "learning_rate": 1.8562563466230577e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.8402795165777206, + "step": 2505 + }, + { + "epoch": 2.47249907715024, + "grad_norm": 0.1972341388463974, + "learning_rate": 1.823120433323361e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8408435776829719, + "step": 2510 + }, + { + "epoch": 2.4774209425372216, + "grad_norm": 0.20761115849018097, + "learning_rate": 1.7902532555234653e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.838873790204525, + "step": 2515 + }, + { + "epoch": 2.482342807924203, + "grad_norm": 0.22367697954177856, + "learning_rate": 1.757655893412622e-06, + "loss": 0.4876, + "mean_token_accuracy": 0.8413331776857376, + "step": 2520 + }, + { + "epoch": 2.487264673311185, + "grad_norm": 0.20876270532608032, + "learning_rate": 1.7253294183125223e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.8411200374364853, + "step": 2525 + }, + { + "epoch": 2.4921865386981668, + "grad_norm": 0.20132075250148773, + "learning_rate": 1.6932748926420695e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.8395631924271584, + "step": 2530 + }, + { + "epoch": 2.4971084040851483, + "grad_norm": 0.1999741941690445, + "learning_rate": 1.661493369882473e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.843748077750206, + "step": 2535 + }, + { + "epoch": 2.50203026947213, + "grad_norm": 0.21044902503490448, + "learning_rate": 1.6299858945426251e-06, + "loss": 0.4856, + "mean_token_accuracy": 0.8423863723874092, + "step": 2540 + }, + { + "epoch": 2.5069521348591115, + "grad_norm": 0.19819578528404236, + "learning_rate": 1.5987535021247668e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.8423318341374397, + "step": 2545 + }, + { + "epoch": 2.5118740002460935, + "grad_norm": 0.2015785425901413, + "learning_rate": 1.5677972190904623e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8417120486497879, + "step": 2550 + }, + { + "epoch": 2.5167958656330747, + "grad_norm": 0.20403100550174713, + "learning_rate": 1.537118062826859e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8435953631997108, + "step": 2555 + }, + { + "epoch": 2.5217177310200567, + "grad_norm": 0.2051580399274826, + "learning_rate": 1.5067170416132603e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.842904870212078, + "step": 2560 + }, + { + "epoch": 2.5266395964070383, + "grad_norm": 0.20559805631637573, + "learning_rate": 1.4765951545879732e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.8392938315868378, + "step": 2565 + }, + { + "epoch": 2.53156146179402, + "grad_norm": 0.21315298974514008, + "learning_rate": 1.4467533917154842e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8433891490101815, + "step": 2570 + }, + { + "epoch": 2.5364833271810014, + "grad_norm": 0.33885088562965393, + "learning_rate": 1.4171927337539103e-06, + "loss": 0.4925, + "mean_token_accuracy": 0.8398235127329826, + "step": 2575 + }, + { + "epoch": 2.541405192567983, + "grad_norm": 0.19653761386871338, + "learning_rate": 1.3879141522227878e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8408400386571884, + "step": 2580 + }, + { + "epoch": 2.546327057954965, + "grad_norm": 0.19870713353157043, + "learning_rate": 1.3589186093711227e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.8433947190642357, + "step": 2585 + }, + { + "epoch": 2.5512489233419466, + "grad_norm": 0.20051565766334534, + "learning_rate": 1.3302070581457716e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.838576278090477, + "step": 2590 + }, + { + "epoch": 2.5561707887289282, + "grad_norm": 0.2312447875738144, + "learning_rate": 1.3017804421601298e-06, + "loss": 0.492, + "mean_token_accuracy": 0.8404266074299812, + "step": 2595 + }, + { + "epoch": 2.56109265411591, + "grad_norm": 0.21526625752449036, + "learning_rate": 1.273639695663108e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.8403177246451378, + "step": 2600 + }, + { + "epoch": 2.5660145195028914, + "grad_norm": 0.4974516034126282, + "learning_rate": 1.245785743508441e-06, + "loss": 0.4887, + "mean_token_accuracy": 0.8414172142744064, + "step": 2605 + }, + { + "epoch": 2.5709363848898734, + "grad_norm": 0.19956116378307343, + "learning_rate": 1.2182195011242747e-06, + "loss": 0.5017, + "mean_token_accuracy": 0.837465213239193, + "step": 2610 + }, + { + "epoch": 2.575858250276855, + "grad_norm": 0.19986701011657715, + "learning_rate": 1.1909418744831048e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.8414024114608765, + "step": 2615 + }, + { + "epoch": 2.5807801156638366, + "grad_norm": 0.20174540579319, + "learning_rate": 1.1639537600719764e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8420050874352455, + "step": 2620 + }, + { + "epoch": 2.585701981050818, + "grad_norm": 0.20654183626174927, + "learning_rate": 1.1372560448630377e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.8395126640796662, + "step": 2625 + }, + { + "epoch": 2.5906238464377997, + "grad_norm": 0.19598302245140076, + "learning_rate": 1.1108496062843743e-06, + "loss": 0.486, + "mean_token_accuracy": 0.8420949026942253, + "step": 2630 + }, + { + "epoch": 2.5955457118247818, + "grad_norm": 0.20486712455749512, + "learning_rate": 1.0847353121911952e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.8409939989447593, + "step": 2635 + }, + { + "epoch": 2.6004675772117634, + "grad_norm": 0.2051970511674881, + "learning_rate": 1.0589140208372872e-06, + "loss": 0.4871, + "mean_token_accuracy": 0.8416621774435044, + "step": 2640 + }, + { + "epoch": 2.605389442598745, + "grad_norm": 0.20128969848155975, + "learning_rate": 1.0333865808468203e-06, + "loss": 0.4824, + "mean_token_accuracy": 0.8431450635194778, + "step": 2645 + }, + { + "epoch": 2.6103113079857265, + "grad_norm": 0.2007114738225937, + "learning_rate": 1.008153831186457e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.8406037405133248, + "step": 2650 + }, + { + "epoch": 2.615233173372708, + "grad_norm": 0.19757139682769775, + "learning_rate": 9.83216601137773e-07, + "loss": 0.488, + "mean_token_accuracy": 0.8414921492338181, + "step": 2655 + }, + { + "epoch": 2.62015503875969, + "grad_norm": 0.21764694154262543, + "learning_rate": 9.58575710270011e-07, + "loss": 0.4819, + "mean_token_accuracy": 0.8431682124733925, + "step": 2660 + }, + { + "epoch": 2.6250769041466717, + "grad_norm": 0.20229902863502502, + "learning_rate": 9.342319684131396e-07, + "loss": 0.4916, + "mean_token_accuracy": 0.8404648944735527, + "step": 2665 + }, + { + "epoch": 2.6299987695336533, + "grad_norm": 0.22413024306297302, + "learning_rate": 9.101861756312369e-07, + "loss": 0.489, + "mean_token_accuracy": 0.8410172060132026, + "step": 2670 + }, + { + "epoch": 2.634920634920635, + "grad_norm": 0.1993047147989273, + "learning_rate": 8.864391221962065e-07, + "loss": 0.488, + "mean_token_accuracy": 0.841397476196289, + "step": 2675 + }, + { + "epoch": 2.6398425003076165, + "grad_norm": 0.20383085310459137, + "learning_rate": 8.629915885617912e-07, + "loss": 0.4906, + "mean_token_accuracy": 0.8405807599425316, + "step": 2680 + }, + { + "epoch": 2.6447643656945985, + "grad_norm": 0.19943130016326904, + "learning_rate": 8.398443453379268e-07, + "loss": 0.4872, + "mean_token_accuracy": 0.841593649983406, + "step": 2685 + }, + { + "epoch": 2.64968623108158, + "grad_norm": 0.19960327446460724, + "learning_rate": 8.169981532654269e-07, + "loss": 0.4854, + "mean_token_accuracy": 0.8422250881791115, + "step": 2690 + }, + { + "epoch": 2.6546080964685617, + "grad_norm": 0.20726507902145386, + "learning_rate": 7.944537631909666e-07, + "loss": 0.4855, + "mean_token_accuracy": 0.8422259956598281, + "step": 2695 + }, + { + "epoch": 2.6595299618555432, + "grad_norm": 0.19812346994876862, + "learning_rate": 7.722119160424113e-07, + "loss": 0.4867, + "mean_token_accuracy": 0.842007802426815, + "step": 2700 + }, + { + "epoch": 2.664451827242525, + "grad_norm": 0.19591908156871796, + "learning_rate": 7.502733428044684e-07, + "loss": 0.486, + "mean_token_accuracy": 0.8423181056976319, + "step": 2705 + }, + { + "epoch": 2.669373692629507, + "grad_norm": 0.195572167634964, + "learning_rate": 7.286387644946602e-07, + "loss": 0.4965, + "mean_token_accuracy": 0.8387840166687965, + "step": 2710 + }, + { + "epoch": 2.674295558016488, + "grad_norm": 0.2031807154417038, + "learning_rate": 7.073088921396287e-07, + "loss": 0.4907, + "mean_token_accuracy": 0.840399731695652, + "step": 2715 + }, + { + "epoch": 2.67921742340347, + "grad_norm": 0.2004314363002777, + "learning_rate": 6.862844267517643e-07, + "loss": 0.4846, + "mean_token_accuracy": 0.8423734799027442, + "step": 2720 + }, + { + "epoch": 2.6841392887904516, + "grad_norm": 0.20816642045974731, + "learning_rate": 6.655660593061719e-07, + "loss": 0.4982, + "mean_token_accuracy": 0.8385626211762428, + "step": 2725 + }, + { + "epoch": 2.689061154177433, + "grad_norm": 0.20351089537143707, + "learning_rate": 6.451544707179635e-07, + "loss": 0.4948, + "mean_token_accuracy": 0.8395294427871705, + "step": 2730 + }, + { + "epoch": 2.6939830195644148, + "grad_norm": 0.20076881349086761, + "learning_rate": 6.250503318198664e-07, + "loss": 0.4888, + "mean_token_accuracy": 0.8412301942706109, + "step": 2735 + }, + { + "epoch": 2.6989048849513964, + "grad_norm": 0.25244539976119995, + "learning_rate": 6.052543033401892e-07, + "loss": 0.4918, + "mean_token_accuracy": 0.8402833178639412, + "step": 2740 + }, + { + "epoch": 2.7038267503383784, + "grad_norm": 0.2058088779449463, + "learning_rate": 5.857670358811096e-07, + "loss": 0.4914, + "mean_token_accuracy": 0.8405940279364585, + "step": 2745 + }, + { + "epoch": 2.70874861572536, + "grad_norm": 0.2002749741077423, + "learning_rate": 5.665891698972769e-07, + "loss": 0.4956, + "mean_token_accuracy": 0.8391197189688683, + "step": 2750 + }, + { + "epoch": 2.7136704811123415, + "grad_norm": 0.19865228235721588, + "learning_rate": 5.477213356747746e-07, + "loss": 0.4894, + "mean_token_accuracy": 0.8410469844937325, + "step": 2755 + }, + { + "epoch": 2.718592346499323, + "grad_norm": 0.20059484243392944, + "learning_rate": 5.291641533104053e-07, + "loss": 0.4817, + "mean_token_accuracy": 0.8434463173151017, + "step": 2760 + }, + { + "epoch": 2.7235142118863047, + "grad_norm": 0.19962534308433533, + "learning_rate": 5.109182326913053e-07, + "loss": 0.4815, + "mean_token_accuracy": 0.8433682397007942, + "step": 2765 + }, + { + "epoch": 2.7284360772732867, + "grad_norm": 0.1976374238729477, + "learning_rate": 4.929841734749063e-07, + "loss": 0.4824, + "mean_token_accuracy": 0.8429444268345833, + "step": 2770 + }, + { + "epoch": 2.7333579426602683, + "grad_norm": 0.1919257491827011, + "learning_rate": 4.7536256506922507e-07, + "loss": 0.4858, + "mean_token_accuracy": 0.8420413583517075, + "step": 2775 + }, + { + "epoch": 2.73827980804725, + "grad_norm": 0.21447736024856567, + "learning_rate": 4.580539866134914e-07, + "loss": 0.4898, + "mean_token_accuracy": 0.8408365085721016, + "step": 2780 + }, + { + "epoch": 2.7432016734342315, + "grad_norm": 0.20053516328334808, + "learning_rate": 4.410590069591192e-07, + "loss": 0.4918, + "mean_token_accuracy": 0.8403174698352813, + "step": 2785 + }, + { + "epoch": 2.748123538821213, + "grad_norm": 0.3303152620792389, + "learning_rate": 4.2437818465100313e-07, + "loss": 0.4812, + "mean_token_accuracy": 0.8434215649962425, + "step": 2790 + }, + { + "epoch": 2.753045404208195, + "grad_norm": 0.194558247923851, + "learning_rate": 4.0801206790916815e-07, + "loss": 0.4804, + "mean_token_accuracy": 0.8438364923000335, + "step": 2795 + }, + { + "epoch": 2.7579672695951767, + "grad_norm": 0.19499559700489044, + "learning_rate": 3.919611946107493e-07, + "loss": 0.4825, + "mean_token_accuracy": 0.8429989367723465, + "step": 2800 + }, + { + "epoch": 2.7628891349821583, + "grad_norm": 0.19578364491462708, + "learning_rate": 3.762260922723182e-07, + "loss": 0.4866, + "mean_token_accuracy": 0.8416179150342942, + "step": 2805 + }, + { + "epoch": 2.76781100036914, + "grad_norm": 0.20279313623905182, + "learning_rate": 3.6080727803254003e-07, + "loss": 0.4913, + "mean_token_accuracy": 0.8406556889414787, + "step": 2810 + }, + { + "epoch": 2.7727328657561214, + "grad_norm": 0.20414599776268005, + "learning_rate": 3.457052586351817e-07, + "loss": 0.4921, + "mean_token_accuracy": 0.8403137296438217, + "step": 2815 + }, + { + "epoch": 2.7776547311431035, + "grad_norm": 0.20257827639579773, + "learning_rate": 3.309205304124552e-07, + "loss": 0.4888, + "mean_token_accuracy": 0.841057425737381, + "step": 2820 + }, + { + "epoch": 2.782576596530085, + "grad_norm": 0.19924387335777283, + "learning_rate": 3.1645357926870957e-07, + "loss": 0.4966, + "mean_token_accuracy": 0.8389097020030022, + "step": 2825 + }, + { + "epoch": 2.7874984619170666, + "grad_norm": 0.20351967215538025, + "learning_rate": 3.0230488066445465e-07, + "loss": 0.4912, + "mean_token_accuracy": 0.8404456153512001, + "step": 2830 + }, + { + "epoch": 2.792420327304048, + "grad_norm": 0.199168398976326, + "learning_rate": 2.8847489960074136e-07, + "loss": 0.4936, + "mean_token_accuracy": 0.8398653537034988, + "step": 2835 + }, + { + "epoch": 2.79734219269103, + "grad_norm": 0.19794094562530518, + "learning_rate": 2.7496409060387973e-07, + "loss": 0.4962, + "mean_token_accuracy": 0.8388495057821274, + "step": 2840 + }, + { + "epoch": 2.802264058078012, + "grad_norm": 0.19937507808208466, + "learning_rate": 2.6177289771049274e-07, + "loss": 0.4895, + "mean_token_accuracy": 0.8410208597779274, + "step": 2845 + }, + { + "epoch": 2.807185923464993, + "grad_norm": 0.19925516843795776, + "learning_rate": 2.489017544529315e-07, + "loss": 0.4875, + "mean_token_accuracy": 0.8415358811616898, + "step": 2850 + }, + { + "epoch": 2.812107788851975, + "grad_norm": 0.19592879712581635, + "learning_rate": 2.3635108384502003e-07, + "loss": 0.4949, + "mean_token_accuracy": 0.839320321381092, + "step": 2855 + }, + { + "epoch": 2.8170296542389566, + "grad_norm": 0.19561193883419037, + "learning_rate": 2.2412129836816287e-07, + "loss": 0.4913, + "mean_token_accuracy": 0.840375654399395, + "step": 2860 + }, + { + "epoch": 2.821951519625938, + "grad_norm": 0.1935349404811859, + "learning_rate": 2.1221279995777833e-07, + "loss": 0.4859, + "mean_token_accuracy": 0.8416187852621079, + "step": 2865 + }, + { + "epoch": 2.8268733850129197, + "grad_norm": 0.19886697828769684, + "learning_rate": 2.0062597999009114e-07, + "loss": 0.4821, + "mean_token_accuracy": 0.8432388514280319, + "step": 2870 + }, + { + "epoch": 2.8317952503999013, + "grad_norm": 0.19826510548591614, + "learning_rate": 1.8936121926927508e-07, + "loss": 0.49, + "mean_token_accuracy": 0.8409401133656502, + "step": 2875 + }, + { + "epoch": 2.8367171157868833, + "grad_norm": 0.21422724425792694, + "learning_rate": 1.7841888801493178e-07, + "loss": 0.4897, + "mean_token_accuracy": 0.840906199812889, + "step": 2880 + }, + { + "epoch": 2.841638981173865, + "grad_norm": 0.2021849900484085, + "learning_rate": 1.677993458499272e-07, + "loss": 0.4871, + "mean_token_accuracy": 0.8416887044906616, + "step": 2885 + }, + { + "epoch": 2.8465608465608465, + "grad_norm": 0.19902034103870392, + "learning_rate": 1.5750294178856872e-07, + "loss": 0.4884, + "mean_token_accuracy": 0.8414162322878838, + "step": 2890 + }, + { + "epoch": 2.851482711947828, + "grad_norm": 0.19861221313476562, + "learning_rate": 1.4753001422514125e-07, + "loss": 0.4926, + "mean_token_accuracy": 0.8401012614369392, + "step": 2895 + }, + { + "epoch": 2.8564045773348097, + "grad_norm": 0.19735361635684967, + "learning_rate": 1.378808909227769e-07, + "loss": 0.4849, + "mean_token_accuracy": 0.8422791570425033, + "step": 2900 + }, + { + "epoch": 2.8613264427217917, + "grad_norm": 0.20118270814418793, + "learning_rate": 1.2855588900269057e-07, + "loss": 0.4912, + "mean_token_accuracy": 0.8406861796975136, + "step": 2905 + }, + { + "epoch": 2.8662483081087733, + "grad_norm": 0.19249391555786133, + "learning_rate": 1.1955531493375137e-07, + "loss": 0.4795, + "mean_token_accuracy": 0.8438849881291389, + "step": 2910 + }, + { + "epoch": 2.871170173495755, + "grad_norm": 0.19686251878738403, + "learning_rate": 1.1087946452241871e-07, + "loss": 0.4937, + "mean_token_accuracy": 0.8399393901228904, + "step": 2915 + }, + { + "epoch": 2.8760920388827365, + "grad_norm": 0.1956326812505722, + "learning_rate": 1.0252862290301092e-07, + "loss": 0.4887, + "mean_token_accuracy": 0.841577798128128, + "step": 2920 + }, + { + "epoch": 2.881013904269718, + "grad_norm": 0.2053905874490738, + "learning_rate": 9.45030645283418e-08, + "loss": 0.4897, + "mean_token_accuracy": 0.8410707041621208, + "step": 2925 + }, + { + "epoch": 2.8859357696567, + "grad_norm": 0.19495834410190582, + "learning_rate": 8.68030531606967e-08, + "loss": 0.4927, + "mean_token_accuracy": 0.8402184978127479, + "step": 2930 + }, + { + "epoch": 2.8908576350436817, + "grad_norm": 0.1992396116256714, + "learning_rate": 7.94288418631639e-08, + "loss": 0.4857, + "mean_token_accuracy": 0.842261828482151, + "step": 2935 + }, + { + "epoch": 2.8957795004306632, + "grad_norm": 0.20448440313339233, + "learning_rate": 7.238067299131901e-08, + "loss": 0.4907, + "mean_token_accuracy": 0.841072927415371, + "step": 2940 + }, + { + "epoch": 2.900701365817645, + "grad_norm": 0.19940471649169922, + "learning_rate": 6.565877818526245e-08, + "loss": 0.4886, + "mean_token_accuracy": 0.8412072688341141, + "step": 2945 + }, + { + "epoch": 2.9056232312046264, + "grad_norm": 0.19256047904491425, + "learning_rate": 5.926337836199891e-08, + "loss": 0.4867, + "mean_token_accuracy": 0.8416444838047028, + "step": 2950 + }, + { + "epoch": 2.9105450965916084, + "grad_norm": 0.19797919690608978, + "learning_rate": 5.319468370818537e-08, + "loss": 0.4897, + "mean_token_accuracy": 0.8410748258233071, + "step": 2955 + }, + { + "epoch": 2.91546696197859, + "grad_norm": 0.1998082846403122, + "learning_rate": 4.7452893673216596e-08, + "loss": 0.4845, + "mean_token_accuracy": 0.8427498519420624, + "step": 2960 + }, + { + "epoch": 2.9203888273655716, + "grad_norm": 0.19540701806545258, + "learning_rate": 4.203819696267486e-08, + "loss": 0.4907, + "mean_token_accuracy": 0.8408638656139373, + "step": 2965 + }, + { + "epoch": 2.925310692752553, + "grad_norm": 0.19913552701473236, + "learning_rate": 3.6950771532126004e-08, + "loss": 0.4983, + "mean_token_accuracy": 0.8385754480957985, + "step": 2970 + }, + { + "epoch": 2.9302325581395348, + "grad_norm": 0.19257843494415283, + "learning_rate": 3.2190784581270786e-08, + "loss": 0.4878, + "mean_token_accuracy": 0.841645573079586, + "step": 2975 + }, + { + "epoch": 2.935154423526517, + "grad_norm": 0.19568364322185516, + "learning_rate": 2.7758392548449253e-08, + "loss": 0.4891, + "mean_token_accuracy": 0.8412896126508713, + "step": 2980 + }, + { + "epoch": 2.9400762889134984, + "grad_norm": 0.20067226886749268, + "learning_rate": 2.3653741105499338e-08, + "loss": 0.4836, + "mean_token_accuracy": 0.8427690804004669, + "step": 2985 + }, + { + "epoch": 2.94499815430048, + "grad_norm": 0.19799287617206573, + "learning_rate": 1.9876965152975102e-08, + "loss": 0.4895, + "mean_token_accuracy": 0.8405489608645439, + "step": 2990 + }, + { + "epoch": 2.9499200196874615, + "grad_norm": 1.0325350761413574, + "learning_rate": 1.6428188815703627e-08, + "loss": 0.4896, + "mean_token_accuracy": 0.8411920800805092, + "step": 2995 + }, + { + "epoch": 2.954841885074443, + "grad_norm": 0.1966339498758316, + "learning_rate": 1.3307525438711611e-08, + "loss": 0.488, + "mean_token_accuracy": 0.841396550834179, + "step": 3000 + }, + { + "epoch": 2.959763750461425, + "grad_norm": 0.2234841138124466, + "learning_rate": 1.0515077583498346e-08, + "loss": 0.4911, + "mean_token_accuracy": 0.8406392633914948, + "step": 3005 + }, + { + "epoch": 2.9646856158484063, + "grad_norm": 0.27488455176353455, + "learning_rate": 8.050937024666195e-09, + "loss": 0.4942, + "mean_token_accuracy": 0.8396434351801872, + "step": 3010 + }, + { + "epoch": 2.9696074812353883, + "grad_norm": 0.1911349892616272, + "learning_rate": 5.9151847469041125e-09, + "loss": 0.4823, + "mean_token_accuracy": 0.8430395260453224, + "step": 3015 + }, + { + "epoch": 2.97452934662237, + "grad_norm": 0.19882096350193024, + "learning_rate": 4.1078909423253325e-09, + "loss": 0.4995, + "mean_token_accuracy": 0.8379872292280197, + "step": 3020 + }, + { + "epoch": 2.9794512120093515, + "grad_norm": 0.20069076120853424, + "learning_rate": 2.629115008160321e-09, + "loss": 0.4964, + "mean_token_accuracy": 0.8388297706842422, + "step": 3025 + }, + { + "epoch": 2.984373077396333, + "grad_norm": 0.19437766075134277, + "learning_rate": 1.4789055448061195e-09, + "loss": 0.4851, + "mean_token_accuracy": 0.8421405225992202, + "step": 3030 + }, + { + "epoch": 2.9892949427833146, + "grad_norm": 0.19950829446315765, + "learning_rate": 6.573003542276191e-10, + "loss": 0.4889, + "mean_token_accuracy": 0.8408236041665077, + "step": 3035 + }, + { + "epoch": 2.9942168081702967, + "grad_norm": 0.19173409044742584, + "learning_rate": 1.6432643871633346e-10, + "loss": 0.4873, + "mean_token_accuracy": 0.8419449985027313, + "step": 3040 + }, + { + "epoch": 2.9991386735572783, + "grad_norm": 0.1980327069759369, + "learning_rate": 0.0, + "loss": 0.4895, + "mean_token_accuracy": 0.8409327268600464, + "step": 3045 + }, + { + "epoch": 2.9991386735572783, + "step": 3045, + "total_flos": 2550348896010240.0, + "train_loss": 0.5881131024979214, + "train_runtime": 268544.791, + "train_samples_per_second": 1.452, + "train_steps_per_second": 0.011 + } + ], + "logging_steps": 5, + "max_steps": 3045, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2550348896010240.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}