{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.999859481486686, "eval_steps": 100, "global_step": 3558, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 12.4375, "learning_rate": 1.4073580663293987e-05, "loss": 0.6629, "step": 10 }, { "epoch": 0.01, "grad_norm": 8.125, "learning_rate": 1.4047161326587971e-05, "loss": 0.6735, "step": 20 }, { "epoch": 0.02, "grad_norm": 9.5625, "learning_rate": 1.4020741989881957e-05, "loss": 0.6154, "step": 30 }, { "epoch": 0.02, "grad_norm": 11.25, "learning_rate": 1.3994322653175943e-05, "loss": 0.5955, "step": 40 }, { "epoch": 0.03, "grad_norm": 8.8125, "learning_rate": 1.3967903316469927e-05, "loss": 0.6217, "step": 50 }, { "epoch": 0.03, "grad_norm": 9.125, "learning_rate": 1.3941483979763912e-05, "loss": 0.6072, "step": 60 }, { "epoch": 0.04, "grad_norm": 10.1875, "learning_rate": 1.3915064643057898e-05, "loss": 0.5763, "step": 70 }, { "epoch": 0.04, "grad_norm": 10.5, "learning_rate": 1.3888645306351884e-05, "loss": 0.5889, "step": 80 }, { "epoch": 0.05, "grad_norm": 14.0625, "learning_rate": 1.3862225969645868e-05, "loss": 0.6259, "step": 90 }, { "epoch": 0.06, "grad_norm": 9.25, "learning_rate": 1.3835806632939854e-05, "loss": 0.5735, "step": 100 }, { "epoch": 0.06, "eval_accuracy": 0.6877384196185287, "eval_loss": 0.5604358315467834, "eval_runtime": 141.9655, "eval_samples_per_second": 12.926, "eval_steps_per_second": 1.62, "step": 100 }, { "epoch": 0.06, "grad_norm": 18.5, "learning_rate": 1.380938729623384e-05, "loss": 0.5464, "step": 110 }, { "epoch": 0.07, "grad_norm": 18.75, "learning_rate": 1.3782967959527825e-05, "loss": 0.5596, "step": 120 }, { "epoch": 0.07, "grad_norm": 9.125, "learning_rate": 1.375654862282181e-05, "loss": 0.5753, "step": 130 }, { "epoch": 0.08, "grad_norm": 9.1875, "learning_rate": 1.3730129286115797e-05, "loss": 0.5853, "step": 140 }, { "epoch": 0.08, "grad_norm": 9.625, "learning_rate": 1.3703709949409781e-05, "loss": 0.5425, "step": 150 }, { "epoch": 0.09, "grad_norm": 9.5, "learning_rate": 1.3677290612703766e-05, "loss": 0.5756, "step": 160 }, { "epoch": 0.1, "grad_norm": 10.4375, "learning_rate": 1.3650871275997752e-05, "loss": 0.5649, "step": 170 }, { "epoch": 0.1, "grad_norm": 7.4375, "learning_rate": 1.3624451939291738e-05, "loss": 0.6037, "step": 180 }, { "epoch": 0.11, "grad_norm": 9.9375, "learning_rate": 1.3598032602585722e-05, "loss": 0.5602, "step": 190 }, { "epoch": 0.11, "grad_norm": 9.6875, "learning_rate": 1.3571613265879708e-05, "loss": 0.5645, "step": 200 }, { "epoch": 0.11, "eval_accuracy": 0.7264305177111716, "eval_loss": 0.5389161705970764, "eval_runtime": 141.9974, "eval_samples_per_second": 12.923, "eval_steps_per_second": 1.62, "step": 200 }, { "epoch": 0.12, "grad_norm": 9.625, "learning_rate": 1.3545193929173694e-05, "loss": 0.5664, "step": 210 }, { "epoch": 0.12, "grad_norm": 10.0, "learning_rate": 1.3518774592467678e-05, "loss": 0.5098, "step": 220 }, { "epoch": 0.13, "grad_norm": 9.0, "learning_rate": 1.3492355255761665e-05, "loss": 0.4876, "step": 230 }, { "epoch": 0.13, "grad_norm": 12.9375, "learning_rate": 1.346593591905565e-05, "loss": 0.5043, "step": 240 }, { "epoch": 0.14, "grad_norm": 13.0, "learning_rate": 1.3439516582349635e-05, "loss": 0.5291, "step": 250 }, { "epoch": 0.15, "grad_norm": 6.75, "learning_rate": 1.341309724564362e-05, "loss": 0.5564, "step": 260 }, { "epoch": 0.15, "grad_norm": 12.625, "learning_rate": 1.3386677908937605e-05, "loss": 0.5807, "step": 270 }, { "epoch": 0.16, "grad_norm": 8.5625, "learning_rate": 1.3360258572231591e-05, "loss": 0.5704, "step": 280 }, { "epoch": 0.16, "grad_norm": 9.9375, "learning_rate": 1.3333839235525576e-05, "loss": 0.5086, "step": 290 }, { "epoch": 0.17, "grad_norm": 8.625, "learning_rate": 1.3307419898819562e-05, "loss": 0.5365, "step": 300 }, { "epoch": 0.17, "eval_accuracy": 0.7395095367847412, "eval_loss": 0.5187998414039612, "eval_runtime": 142.1499, "eval_samples_per_second": 12.909, "eval_steps_per_second": 1.618, "step": 300 }, { "epoch": 0.17, "grad_norm": 10.25, "learning_rate": 1.3281000562113548e-05, "loss": 0.5152, "step": 310 }, { "epoch": 0.18, "grad_norm": 8.25, "learning_rate": 1.3254581225407534e-05, "loss": 0.5082, "step": 320 }, { "epoch": 0.19, "grad_norm": 11.8125, "learning_rate": 1.3228161888701518e-05, "loss": 0.5098, "step": 330 }, { "epoch": 0.19, "grad_norm": 7.34375, "learning_rate": 1.3201742551995504e-05, "loss": 0.452, "step": 340 }, { "epoch": 0.2, "grad_norm": 9.375, "learning_rate": 1.317532321528949e-05, "loss": 0.5113, "step": 350 }, { "epoch": 0.2, "grad_norm": 9.125, "learning_rate": 1.3148903878583473e-05, "loss": 0.5321, "step": 360 }, { "epoch": 0.21, "grad_norm": 8.125, "learning_rate": 1.3122484541877459e-05, "loss": 0.4968, "step": 370 }, { "epoch": 0.21, "grad_norm": 10.1875, "learning_rate": 1.3096065205171445e-05, "loss": 0.491, "step": 380 }, { "epoch": 0.22, "grad_norm": 7.4375, "learning_rate": 1.306964586846543e-05, "loss": 0.583, "step": 390 }, { "epoch": 0.22, "grad_norm": 8.8125, "learning_rate": 1.3043226531759416e-05, "loss": 0.514, "step": 400 }, { "epoch": 0.22, "eval_accuracy": 0.7482288828337874, "eval_loss": 0.5034126043319702, "eval_runtime": 142.1556, "eval_samples_per_second": 12.908, "eval_steps_per_second": 1.618, "step": 400 }, { "epoch": 0.23, "grad_norm": 12.375, "learning_rate": 1.3016807195053402e-05, "loss": 0.5058, "step": 410 }, { "epoch": 0.24, "grad_norm": 7.3125, "learning_rate": 1.2990387858347388e-05, "loss": 0.4892, "step": 420 }, { "epoch": 0.24, "grad_norm": 9.25, "learning_rate": 1.2963968521641372e-05, "loss": 0.4949, "step": 430 }, { "epoch": 0.25, "grad_norm": 9.1875, "learning_rate": 1.2937549184935358e-05, "loss": 0.4951, "step": 440 }, { "epoch": 0.25, "grad_norm": 11.3125, "learning_rate": 1.2911129848229344e-05, "loss": 0.5073, "step": 450 }, { "epoch": 0.26, "grad_norm": 9.5625, "learning_rate": 1.2884710511523327e-05, "loss": 0.5347, "step": 460 }, { "epoch": 0.26, "grad_norm": 8.8125, "learning_rate": 1.2858291174817313e-05, "loss": 0.5141, "step": 470 }, { "epoch": 0.27, "grad_norm": 11.25, "learning_rate": 1.2831871838111299e-05, "loss": 0.4989, "step": 480 }, { "epoch": 0.28, "grad_norm": 8.375, "learning_rate": 1.2805452501405283e-05, "loss": 0.4907, "step": 490 }, { "epoch": 0.28, "grad_norm": 10.5, "learning_rate": 1.277903316469927e-05, "loss": 0.5059, "step": 500 }, { "epoch": 0.28, "eval_accuracy": 0.750408719346049, "eval_loss": 0.49664556980133057, "eval_runtime": 142.216, "eval_samples_per_second": 12.903, "eval_steps_per_second": 1.617, "step": 500 }, { "epoch": 0.29, "grad_norm": 7.28125, "learning_rate": 1.2752613827993255e-05, "loss": 0.4931, "step": 510 }, { "epoch": 0.29, "grad_norm": 7.1875, "learning_rate": 1.2726194491287241e-05, "loss": 0.5002, "step": 520 }, { "epoch": 0.3, "grad_norm": 18.5, "learning_rate": 1.2699775154581226e-05, "loss": 0.5315, "step": 530 }, { "epoch": 0.3, "grad_norm": 8.0, "learning_rate": 1.2673355817875212e-05, "loss": 0.4859, "step": 540 }, { "epoch": 0.31, "grad_norm": 7.9375, "learning_rate": 1.2646936481169198e-05, "loss": 0.5106, "step": 550 }, { "epoch": 0.31, "grad_norm": 8.1875, "learning_rate": 1.262051714446318e-05, "loss": 0.4945, "step": 560 }, { "epoch": 0.32, "grad_norm": 10.25, "learning_rate": 1.2594097807757167e-05, "loss": 0.4444, "step": 570 }, { "epoch": 0.33, "grad_norm": 12.1875, "learning_rate": 1.2567678471051153e-05, "loss": 0.5583, "step": 580 }, { "epoch": 0.33, "grad_norm": 7.28125, "learning_rate": 1.2541259134345139e-05, "loss": 0.5095, "step": 590 }, { "epoch": 0.34, "grad_norm": 7.71875, "learning_rate": 1.2514839797639123e-05, "loss": 0.5069, "step": 600 }, { "epoch": 0.34, "eval_accuracy": 0.755858310626703, "eval_loss": 0.49528396129608154, "eval_runtime": 142.1925, "eval_samples_per_second": 12.905, "eval_steps_per_second": 1.618, "step": 600 }, { "epoch": 0.34, "grad_norm": 9.4375, "learning_rate": 1.2488420460933109e-05, "loss": 0.4984, "step": 610 }, { "epoch": 0.35, "grad_norm": 9.0, "learning_rate": 1.2462001124227095e-05, "loss": 0.5387, "step": 620 }, { "epoch": 0.35, "grad_norm": 6.53125, "learning_rate": 1.243558178752108e-05, "loss": 0.5113, "step": 630 }, { "epoch": 0.36, "grad_norm": 7.5625, "learning_rate": 1.2409162450815066e-05, "loss": 0.4985, "step": 640 }, { "epoch": 0.37, "grad_norm": 7.875, "learning_rate": 1.2382743114109052e-05, "loss": 0.4794, "step": 650 }, { "epoch": 0.37, "grad_norm": 15.6875, "learning_rate": 1.2356323777403036e-05, "loss": 0.4693, "step": 660 }, { "epoch": 0.38, "grad_norm": 7.21875, "learning_rate": 1.232990444069702e-05, "loss": 0.4933, "step": 670 }, { "epoch": 0.38, "grad_norm": 10.0625, "learning_rate": 1.2303485103991006e-05, "loss": 0.5151, "step": 680 }, { "epoch": 0.39, "grad_norm": 9.3125, "learning_rate": 1.2277065767284992e-05, "loss": 0.4777, "step": 690 }, { "epoch": 0.39, "grad_norm": 8.3125, "learning_rate": 1.2250646430578977e-05, "loss": 0.4563, "step": 700 }, { "epoch": 0.39, "eval_accuracy": 0.7525885558583106, "eval_loss": 0.4931628704071045, "eval_runtime": 142.3987, "eval_samples_per_second": 12.886, "eval_steps_per_second": 1.615, "step": 700 }, { "epoch": 0.4, "grad_norm": 11.8125, "learning_rate": 1.2224227093872963e-05, "loss": 0.4919, "step": 710 }, { "epoch": 0.4, "grad_norm": 7.03125, "learning_rate": 1.2197807757166949e-05, "loss": 0.5312, "step": 720 }, { "epoch": 0.41, "grad_norm": 7.8125, "learning_rate": 1.2171388420460933e-05, "loss": 0.4921, "step": 730 }, { "epoch": 0.42, "grad_norm": 6.59375, "learning_rate": 1.214496908375492e-05, "loss": 0.5291, "step": 740 }, { "epoch": 0.42, "grad_norm": 8.0625, "learning_rate": 1.2118549747048905e-05, "loss": 0.5134, "step": 750 }, { "epoch": 0.43, "grad_norm": 9.0, "learning_rate": 1.209213041034289e-05, "loss": 0.5169, "step": 760 }, { "epoch": 0.43, "grad_norm": 9.875, "learning_rate": 1.2065711073636874e-05, "loss": 0.4899, "step": 770 }, { "epoch": 0.44, "grad_norm": 13.875, "learning_rate": 1.203929173693086e-05, "loss": 0.5279, "step": 780 }, { "epoch": 0.44, "grad_norm": 8.9375, "learning_rate": 1.2012872400224846e-05, "loss": 0.4735, "step": 790 }, { "epoch": 0.45, "grad_norm": 9.4375, "learning_rate": 1.198645306351883e-05, "loss": 0.4622, "step": 800 }, { "epoch": 0.45, "eval_accuracy": 0.7645776566757493, "eval_loss": 0.47542256116867065, "eval_runtime": 142.298, "eval_samples_per_second": 12.895, "eval_steps_per_second": 1.616, "step": 800 }, { "epoch": 0.46, "grad_norm": 10.25, "learning_rate": 1.1960033726812817e-05, "loss": 0.4805, "step": 810 }, { "epoch": 0.46, "grad_norm": 8.875, "learning_rate": 1.1933614390106803e-05, "loss": 0.4524, "step": 820 }, { "epoch": 0.47, "grad_norm": 9.625, "learning_rate": 1.1907195053400787e-05, "loss": 0.4611, "step": 830 }, { "epoch": 0.47, "grad_norm": 10.5, "learning_rate": 1.1880775716694773e-05, "loss": 0.4916, "step": 840 }, { "epoch": 0.48, "grad_norm": 11.25, "learning_rate": 1.1854356379988759e-05, "loss": 0.485, "step": 850 }, { "epoch": 0.48, "grad_norm": 10.625, "learning_rate": 1.1827937043282744e-05, "loss": 0.4987, "step": 860 }, { "epoch": 0.49, "grad_norm": 10.3125, "learning_rate": 1.1801517706576728e-05, "loss": 0.5131, "step": 870 }, { "epoch": 0.49, "grad_norm": 7.53125, "learning_rate": 1.1775098369870714e-05, "loss": 0.536, "step": 880 }, { "epoch": 0.5, "grad_norm": 7.8125, "learning_rate": 1.17486790331647e-05, "loss": 0.4573, "step": 890 }, { "epoch": 0.51, "grad_norm": 10.5625, "learning_rate": 1.1722259696458684e-05, "loss": 0.4652, "step": 900 }, { "epoch": 0.51, "eval_accuracy": 0.750408719346049, "eval_loss": 0.4861762225627899, "eval_runtime": 142.333, "eval_samples_per_second": 12.892, "eval_steps_per_second": 1.616, "step": 900 }, { "epoch": 0.51, "grad_norm": 7.0625, "learning_rate": 1.169584035975267e-05, "loss": 0.4585, "step": 910 }, { "epoch": 0.52, "grad_norm": 9.0625, "learning_rate": 1.1669421023046656e-05, "loss": 0.4912, "step": 920 }, { "epoch": 0.52, "grad_norm": 7.09375, "learning_rate": 1.164300168634064e-05, "loss": 0.4856, "step": 930 }, { "epoch": 0.53, "grad_norm": 12.75, "learning_rate": 1.1616582349634627e-05, "loss": 0.5451, "step": 940 }, { "epoch": 0.53, "grad_norm": 9.625, "learning_rate": 1.1590163012928613e-05, "loss": 0.4591, "step": 950 }, { "epoch": 0.54, "grad_norm": 11.1875, "learning_rate": 1.1563743676222597e-05, "loss": 0.46, "step": 960 }, { "epoch": 0.55, "grad_norm": 8.875, "learning_rate": 1.1537324339516582e-05, "loss": 0.5013, "step": 970 }, { "epoch": 0.55, "grad_norm": 11.6875, "learning_rate": 1.1510905002810568e-05, "loss": 0.4567, "step": 980 }, { "epoch": 0.56, "grad_norm": 13.0625, "learning_rate": 1.1484485666104554e-05, "loss": 0.4723, "step": 990 }, { "epoch": 0.56, "grad_norm": 8.8125, "learning_rate": 1.1458066329398538e-05, "loss": 0.475, "step": 1000 }, { "epoch": 0.56, "eval_accuracy": 0.75858310626703, "eval_loss": 0.4794474244117737, "eval_runtime": 142.239, "eval_samples_per_second": 12.901, "eval_steps_per_second": 1.617, "step": 1000 }, { "epoch": 0.57, "grad_norm": 9.1875, "learning_rate": 1.1431646992692524e-05, "loss": 0.4791, "step": 1010 }, { "epoch": 0.57, "grad_norm": 7.6875, "learning_rate": 1.140522765598651e-05, "loss": 0.4898, "step": 1020 }, { "epoch": 0.58, "grad_norm": 11.0, "learning_rate": 1.1378808319280496e-05, "loss": 0.4457, "step": 1030 }, { "epoch": 0.58, "grad_norm": 9.75, "learning_rate": 1.135238898257448e-05, "loss": 0.4652, "step": 1040 }, { "epoch": 0.59, "grad_norm": 8.625, "learning_rate": 1.1325969645868467e-05, "loss": 0.4492, "step": 1050 }, { "epoch": 0.6, "grad_norm": 12.6875, "learning_rate": 1.1299550309162451e-05, "loss": 0.4723, "step": 1060 }, { "epoch": 0.6, "grad_norm": 9.0625, "learning_rate": 1.1273130972456435e-05, "loss": 0.4102, "step": 1070 }, { "epoch": 0.61, "grad_norm": 11.5625, "learning_rate": 1.1246711635750421e-05, "loss": 0.5307, "step": 1080 }, { "epoch": 0.61, "grad_norm": 10.0625, "learning_rate": 1.1220292299044407e-05, "loss": 0.5015, "step": 1090 }, { "epoch": 0.62, "grad_norm": 10.5, "learning_rate": 1.1193872962338392e-05, "loss": 0.4674, "step": 1100 }, { "epoch": 0.62, "eval_accuracy": 0.7651226158038147, "eval_loss": 0.4715699851512909, "eval_runtime": 142.1959, "eval_samples_per_second": 12.905, "eval_steps_per_second": 1.617, "step": 1100 }, { "epoch": 0.62, "grad_norm": 11.125, "learning_rate": 1.1167453625632378e-05, "loss": 0.5204, "step": 1110 }, { "epoch": 0.63, "grad_norm": 8.625, "learning_rate": 1.1141034288926364e-05, "loss": 0.5308, "step": 1120 }, { "epoch": 0.64, "grad_norm": 10.9375, "learning_rate": 1.111461495222035e-05, "loss": 0.4792, "step": 1130 }, { "epoch": 0.64, "grad_norm": 8.625, "learning_rate": 1.1088195615514334e-05, "loss": 0.4665, "step": 1140 }, { "epoch": 0.65, "grad_norm": 12.25, "learning_rate": 1.106177627880832e-05, "loss": 0.547, "step": 1150 }, { "epoch": 0.65, "grad_norm": 7.875, "learning_rate": 1.1035356942102305e-05, "loss": 0.5029, "step": 1160 }, { "epoch": 0.66, "grad_norm": 7.4375, "learning_rate": 1.1008937605396289e-05, "loss": 0.5048, "step": 1170 }, { "epoch": 0.66, "grad_norm": 10.3125, "learning_rate": 1.0982518268690275e-05, "loss": 0.4914, "step": 1180 }, { "epoch": 0.67, "grad_norm": 8.125, "learning_rate": 1.0956098931984261e-05, "loss": 0.4516, "step": 1190 }, { "epoch": 0.67, "grad_norm": 10.0625, "learning_rate": 1.0929679595278246e-05, "loss": 0.4795, "step": 1200 }, { "epoch": 0.67, "eval_accuracy": 0.7640326975476839, "eval_loss": 0.4730105400085449, "eval_runtime": 142.3555, "eval_samples_per_second": 12.89, "eval_steps_per_second": 1.616, "step": 1200 }, { "epoch": 0.68, "grad_norm": 11.1875, "learning_rate": 1.0903260258572232e-05, "loss": 0.4903, "step": 1210 }, { "epoch": 0.69, "grad_norm": 8.9375, "learning_rate": 1.0876840921866218e-05, "loss": 0.5608, "step": 1220 }, { "epoch": 0.69, "grad_norm": 10.4375, "learning_rate": 1.0850421585160204e-05, "loss": 0.5333, "step": 1230 }, { "epoch": 0.7, "grad_norm": 7.84375, "learning_rate": 1.0824002248454188e-05, "loss": 0.4759, "step": 1240 }, { "epoch": 0.7, "grad_norm": 9.0625, "learning_rate": 1.0797582911748174e-05, "loss": 0.4687, "step": 1250 }, { "epoch": 0.71, "grad_norm": 8.75, "learning_rate": 1.077116357504216e-05, "loss": 0.4447, "step": 1260 }, { "epoch": 0.71, "grad_norm": 9.625, "learning_rate": 1.0744744238336143e-05, "loss": 0.4866, "step": 1270 }, { "epoch": 0.72, "grad_norm": 10.8125, "learning_rate": 1.0718324901630129e-05, "loss": 0.4746, "step": 1280 }, { "epoch": 0.73, "grad_norm": 8.625, "learning_rate": 1.0691905564924115e-05, "loss": 0.4857, "step": 1290 }, { "epoch": 0.73, "grad_norm": 9.0625, "learning_rate": 1.0665486228218101e-05, "loss": 0.4863, "step": 1300 }, { "epoch": 0.73, "eval_accuracy": 0.7634877384196185, "eval_loss": 0.47181499004364014, "eval_runtime": 142.2836, "eval_samples_per_second": 12.897, "eval_steps_per_second": 1.616, "step": 1300 }, { "epoch": 0.74, "grad_norm": 13.375, "learning_rate": 1.0639066891512085e-05, "loss": 0.4763, "step": 1310 }, { "epoch": 0.74, "grad_norm": 8.4375, "learning_rate": 1.0612647554806071e-05, "loss": 0.4709, "step": 1320 }, { "epoch": 0.75, "grad_norm": 8.8125, "learning_rate": 1.0586228218100058e-05, "loss": 0.4646, "step": 1330 }, { "epoch": 0.75, "grad_norm": 8.75, "learning_rate": 1.0559808881394042e-05, "loss": 0.4678, "step": 1340 }, { "epoch": 0.76, "grad_norm": 12.375, "learning_rate": 1.0533389544688028e-05, "loss": 0.501, "step": 1350 }, { "epoch": 0.76, "grad_norm": 8.3125, "learning_rate": 1.0506970207982014e-05, "loss": 0.4703, "step": 1360 }, { "epoch": 0.77, "grad_norm": 9.0, "learning_rate": 1.0480550871275997e-05, "loss": 0.4376, "step": 1370 }, { "epoch": 0.78, "grad_norm": 10.375, "learning_rate": 1.0454131534569983e-05, "loss": 0.4358, "step": 1380 }, { "epoch": 0.78, "grad_norm": 10.25, "learning_rate": 1.0427712197863969e-05, "loss": 0.507, "step": 1390 }, { "epoch": 0.79, "grad_norm": 9.0625, "learning_rate": 1.0401292861157955e-05, "loss": 0.4349, "step": 1400 }, { "epoch": 0.79, "eval_accuracy": 0.7640326975476839, "eval_loss": 0.4669208228588104, "eval_runtime": 142.225, "eval_samples_per_second": 12.902, "eval_steps_per_second": 1.617, "step": 1400 }, { "epoch": 0.79, "grad_norm": 7.46875, "learning_rate": 1.037487352445194e-05, "loss": 0.4818, "step": 1410 }, { "epoch": 0.8, "grad_norm": 16.0, "learning_rate": 1.0348454187745925e-05, "loss": 0.4945, "step": 1420 }, { "epoch": 0.8, "grad_norm": 11.0, "learning_rate": 1.0322034851039911e-05, "loss": 0.4396, "step": 1430 }, { "epoch": 0.81, "grad_norm": 12.1875, "learning_rate": 1.0295615514333896e-05, "loss": 0.4604, "step": 1440 }, { "epoch": 0.82, "grad_norm": 11.25, "learning_rate": 1.0269196177627882e-05, "loss": 0.4869, "step": 1450 }, { "epoch": 0.82, "grad_norm": 9.1875, "learning_rate": 1.0242776840921868e-05, "loss": 0.4951, "step": 1460 }, { "epoch": 0.83, "grad_norm": 8.625, "learning_rate": 1.021635750421585e-05, "loss": 0.4856, "step": 1470 }, { "epoch": 0.83, "grad_norm": 8.1875, "learning_rate": 1.0189938167509836e-05, "loss": 0.4302, "step": 1480 }, { "epoch": 0.84, "grad_norm": 10.6875, "learning_rate": 1.0163518830803823e-05, "loss": 0.4723, "step": 1490 }, { "epoch": 0.84, "grad_norm": 12.4375, "learning_rate": 1.0137099494097809e-05, "loss": 0.5332, "step": 1500 }, { "epoch": 0.84, "eval_accuracy": 0.7683923705722071, "eval_loss": 0.47141385078430176, "eval_runtime": 142.2697, "eval_samples_per_second": 12.898, "eval_steps_per_second": 1.617, "step": 1500 }, { "epoch": 0.85, "grad_norm": 9.4375, "learning_rate": 1.0110680157391793e-05, "loss": 0.4281, "step": 1510 }, { "epoch": 0.85, "grad_norm": 9.125, "learning_rate": 1.0084260820685779e-05, "loss": 0.4719, "step": 1520 }, { "epoch": 0.86, "grad_norm": 8.375, "learning_rate": 1.0057841483979765e-05, "loss": 0.4702, "step": 1530 }, { "epoch": 0.87, "grad_norm": 10.6875, "learning_rate": 1.003142214727375e-05, "loss": 0.4523, "step": 1540 }, { "epoch": 0.87, "grad_norm": 10.4375, "learning_rate": 1.0005002810567735e-05, "loss": 0.4807, "step": 1550 }, { "epoch": 0.88, "grad_norm": 8.9375, "learning_rate": 9.978583473861721e-06, "loss": 0.4813, "step": 1560 }, { "epoch": 0.88, "grad_norm": 9.0625, "learning_rate": 9.952164137155706e-06, "loss": 0.4687, "step": 1570 }, { "epoch": 0.89, "grad_norm": 8.375, "learning_rate": 9.92574480044969e-06, "loss": 0.4693, "step": 1580 }, { "epoch": 0.89, "grad_norm": 7.875, "learning_rate": 9.899325463743676e-06, "loss": 0.4809, "step": 1590 }, { "epoch": 0.9, "grad_norm": 9.3125, "learning_rate": 9.872906127037662e-06, "loss": 0.4638, "step": 1600 }, { "epoch": 0.9, "eval_accuracy": 0.7673024523160763, "eval_loss": 0.4669085741043091, "eval_runtime": 142.321, "eval_samples_per_second": 12.893, "eval_steps_per_second": 1.616, "step": 1600 }, { "epoch": 0.9, "grad_norm": 11.0, "learning_rate": 9.846486790331647e-06, "loss": 0.495, "step": 1610 }, { "epoch": 0.91, "grad_norm": 7.3125, "learning_rate": 9.820067453625633e-06, "loss": 0.4823, "step": 1620 }, { "epoch": 0.92, "grad_norm": 9.1875, "learning_rate": 9.793648116919619e-06, "loss": 0.4845, "step": 1630 }, { "epoch": 0.92, "grad_norm": 11.25, "learning_rate": 9.767228780213603e-06, "loss": 0.4868, "step": 1640 }, { "epoch": 0.93, "grad_norm": 9.5625, "learning_rate": 9.74080944350759e-06, "loss": 0.4499, "step": 1650 }, { "epoch": 0.93, "grad_norm": 8.25, "learning_rate": 9.714390106801575e-06, "loss": 0.4736, "step": 1660 }, { "epoch": 0.94, "grad_norm": 11.1875, "learning_rate": 9.68797077009556e-06, "loss": 0.4076, "step": 1670 }, { "epoch": 0.94, "grad_norm": 9.5625, "learning_rate": 9.661551433389544e-06, "loss": 0.4304, "step": 1680 }, { "epoch": 0.95, "grad_norm": 10.875, "learning_rate": 9.63513209668353e-06, "loss": 0.4477, "step": 1690 }, { "epoch": 0.96, "grad_norm": 11.125, "learning_rate": 9.608712759977516e-06, "loss": 0.4751, "step": 1700 }, { "epoch": 0.96, "eval_accuracy": 0.7732970027247956, "eval_loss": 0.4651176333427429, "eval_runtime": 142.2902, "eval_samples_per_second": 12.896, "eval_steps_per_second": 1.616, "step": 1700 }, { "epoch": 0.96, "grad_norm": 7.65625, "learning_rate": 9.5822934232715e-06, "loss": 0.4784, "step": 1710 }, { "epoch": 0.97, "grad_norm": 8.6875, "learning_rate": 9.555874086565486e-06, "loss": 0.5156, "step": 1720 }, { "epoch": 0.97, "grad_norm": 9.75, "learning_rate": 9.529454749859473e-06, "loss": 0.4737, "step": 1730 }, { "epoch": 0.98, "grad_norm": 10.8125, "learning_rate": 9.503035413153457e-06, "loss": 0.4564, "step": 1740 }, { "epoch": 0.98, "grad_norm": 9.4375, "learning_rate": 9.476616076447443e-06, "loss": 0.4341, "step": 1750 }, { "epoch": 0.99, "grad_norm": 9.875, "learning_rate": 9.450196739741429e-06, "loss": 0.4283, "step": 1760 }, { "epoch": 0.99, "grad_norm": 11.3125, "learning_rate": 9.423777403035413e-06, "loss": 0.4569, "step": 1770 }, { "epoch": 1.0, "grad_norm": 11.5625, "learning_rate": 9.397358066329398e-06, "loss": 0.4273, "step": 1780 }, { "epoch": 1.01, "grad_norm": 8.5625, "learning_rate": 9.370938729623384e-06, "loss": 0.4086, "step": 1790 }, { "epoch": 1.01, "grad_norm": 13.75, "learning_rate": 9.34451939291737e-06, "loss": 0.3797, "step": 1800 }, { "epoch": 1.01, "eval_accuracy": 0.7602179836512262, "eval_loss": 0.47710901498794556, "eval_runtime": 142.1748, "eval_samples_per_second": 12.907, "eval_steps_per_second": 1.618, "step": 1800 }, { "epoch": 1.02, "grad_norm": 15.375, "learning_rate": 9.318100056211354e-06, "loss": 0.3601, "step": 1810 }, { "epoch": 1.02, "grad_norm": 9.0, "learning_rate": 9.29168071950534e-06, "loss": 0.3994, "step": 1820 }, { "epoch": 1.03, "grad_norm": 14.4375, "learning_rate": 9.265261382799326e-06, "loss": 0.4058, "step": 1830 }, { "epoch": 1.03, "grad_norm": 7.625, "learning_rate": 9.238842046093312e-06, "loss": 0.32, "step": 1840 }, { "epoch": 1.04, "grad_norm": 13.5, "learning_rate": 9.212422709387297e-06, "loss": 0.3653, "step": 1850 }, { "epoch": 1.05, "grad_norm": 9.25, "learning_rate": 9.186003372681283e-06, "loss": 0.3894, "step": 1860 }, { "epoch": 1.05, "grad_norm": 8.8125, "learning_rate": 9.159584035975267e-06, "loss": 0.3429, "step": 1870 }, { "epoch": 1.06, "grad_norm": 11.375, "learning_rate": 9.133164699269251e-06, "loss": 0.393, "step": 1880 }, { "epoch": 1.06, "grad_norm": 11.9375, "learning_rate": 9.106745362563238e-06, "loss": 0.3478, "step": 1890 }, { "epoch": 1.07, "grad_norm": 12.5, "learning_rate": 9.080326025857224e-06, "loss": 0.3927, "step": 1900 }, { "epoch": 1.07, "eval_accuracy": 0.7591280653950954, "eval_loss": 0.4923277497291565, "eval_runtime": 142.0571, "eval_samples_per_second": 12.917, "eval_steps_per_second": 1.619, "step": 1900 }, { "epoch": 1.07, "grad_norm": 11.875, "learning_rate": 9.053906689151208e-06, "loss": 0.4476, "step": 1910 }, { "epoch": 1.08, "grad_norm": 12.0625, "learning_rate": 9.027487352445194e-06, "loss": 0.3207, "step": 1920 }, { "epoch": 1.08, "grad_norm": 12.3125, "learning_rate": 9.00106801573918e-06, "loss": 0.3907, "step": 1930 }, { "epoch": 1.09, "grad_norm": 10.4375, "learning_rate": 8.974648679033166e-06, "loss": 0.3504, "step": 1940 }, { "epoch": 1.1, "grad_norm": 14.0625, "learning_rate": 8.94822934232715e-06, "loss": 0.3791, "step": 1950 }, { "epoch": 1.1, "grad_norm": 12.8125, "learning_rate": 8.921810005621137e-06, "loss": 0.4153, "step": 1960 }, { "epoch": 1.11, "grad_norm": 13.1875, "learning_rate": 8.895390668915121e-06, "loss": 0.3353, "step": 1970 }, { "epoch": 1.11, "grad_norm": 18.0, "learning_rate": 8.868971332209105e-06, "loss": 0.3413, "step": 1980 }, { "epoch": 1.12, "grad_norm": 11.4375, "learning_rate": 8.842551995503091e-06, "loss": 0.3652, "step": 1990 }, { "epoch": 1.12, "grad_norm": 17.0, "learning_rate": 8.816132658797077e-06, "loss": 0.3466, "step": 2000 }, { "epoch": 1.12, "eval_accuracy": 0.7634877384196185, "eval_loss": 0.5079097151756287, "eval_runtime": 142.0481, "eval_samples_per_second": 12.918, "eval_steps_per_second": 1.619, "step": 2000 }, { "epoch": 1.13, "grad_norm": 15.625, "learning_rate": 8.789713322091062e-06, "loss": 0.417, "step": 2010 }, { "epoch": 1.14, "grad_norm": 12.4375, "learning_rate": 8.763293985385048e-06, "loss": 0.3978, "step": 2020 }, { "epoch": 1.14, "grad_norm": 10.0, "learning_rate": 8.736874648679034e-06, "loss": 0.3681, "step": 2030 }, { "epoch": 1.15, "grad_norm": 15.0625, "learning_rate": 8.71045531197302e-06, "loss": 0.3526, "step": 2040 }, { "epoch": 1.15, "grad_norm": 14.5, "learning_rate": 8.684035975267004e-06, "loss": 0.347, "step": 2050 }, { "epoch": 1.16, "grad_norm": 18.5, "learning_rate": 8.65761663856099e-06, "loss": 0.3665, "step": 2060 }, { "epoch": 1.16, "grad_norm": 17.25, "learning_rate": 8.631197301854975e-06, "loss": 0.4563, "step": 2070 }, { "epoch": 1.17, "grad_norm": 10.8125, "learning_rate": 8.604777965148959e-06, "loss": 0.3535, "step": 2080 }, { "epoch": 1.17, "grad_norm": 19.375, "learning_rate": 8.578358628442945e-06, "loss": 0.3974, "step": 2090 }, { "epoch": 1.18, "grad_norm": 14.75, "learning_rate": 8.551939291736931e-06, "loss": 0.3797, "step": 2100 }, { "epoch": 1.18, "eval_accuracy": 0.7574931880108992, "eval_loss": 0.4956045150756836, "eval_runtime": 142.0979, "eval_samples_per_second": 12.914, "eval_steps_per_second": 1.619, "step": 2100 }, { "epoch": 1.19, "grad_norm": 14.75, "learning_rate": 8.525519955030917e-06, "loss": 0.3321, "step": 2110 }, { "epoch": 1.19, "grad_norm": 15.0625, "learning_rate": 8.499100618324901e-06, "loss": 0.3536, "step": 2120 }, { "epoch": 1.2, "grad_norm": 13.1875, "learning_rate": 8.472681281618888e-06, "loss": 0.3672, "step": 2130 }, { "epoch": 1.2, "grad_norm": 17.0, "learning_rate": 8.446261944912874e-06, "loss": 0.3931, "step": 2140 }, { "epoch": 1.21, "grad_norm": 13.0625, "learning_rate": 8.419842608206858e-06, "loss": 0.3132, "step": 2150 }, { "epoch": 1.21, "grad_norm": 14.5625, "learning_rate": 8.393423271500844e-06, "loss": 0.331, "step": 2160 }, { "epoch": 1.22, "grad_norm": 17.875, "learning_rate": 8.36700393479483e-06, "loss": 0.3682, "step": 2170 }, { "epoch": 1.23, "grad_norm": 19.875, "learning_rate": 8.340584598088813e-06, "loss": 0.354, "step": 2180 }, { "epoch": 1.23, "grad_norm": 10.1875, "learning_rate": 8.314165261382799e-06, "loss": 0.333, "step": 2190 }, { "epoch": 1.24, "grad_norm": 20.75, "learning_rate": 8.287745924676785e-06, "loss": 0.3539, "step": 2200 }, { "epoch": 1.24, "eval_accuracy": 0.753133514986376, "eval_loss": 0.5139533281326294, "eval_runtime": 142.1065, "eval_samples_per_second": 12.913, "eval_steps_per_second": 1.619, "step": 2200 }, { "epoch": 1.24, "grad_norm": 13.3125, "learning_rate": 8.261326587970771e-06, "loss": 0.3441, "step": 2210 }, { "epoch": 1.25, "grad_norm": 19.0, "learning_rate": 8.234907251264755e-06, "loss": 0.3379, "step": 2220 }, { "epoch": 1.25, "grad_norm": 13.375, "learning_rate": 8.208487914558741e-06, "loss": 0.3207, "step": 2230 }, { "epoch": 1.26, "grad_norm": 24.625, "learning_rate": 8.182068577852727e-06, "loss": 0.4002, "step": 2240 }, { "epoch": 1.26, "grad_norm": 21.75, "learning_rate": 8.155649241146712e-06, "loss": 0.3749, "step": 2250 }, { "epoch": 1.27, "grad_norm": 19.0, "learning_rate": 8.129229904440698e-06, "loss": 0.3416, "step": 2260 }, { "epoch": 1.28, "grad_norm": 24.375, "learning_rate": 8.102810567734684e-06, "loss": 0.3538, "step": 2270 }, { "epoch": 1.28, "grad_norm": 22.375, "learning_rate": 8.076391231028668e-06, "loss": 0.4265, "step": 2280 }, { "epoch": 1.29, "grad_norm": 12.3125, "learning_rate": 8.049971894322653e-06, "loss": 0.3184, "step": 2290 }, { "epoch": 1.29, "grad_norm": 15.4375, "learning_rate": 8.023552557616639e-06, "loss": 0.3375, "step": 2300 }, { "epoch": 1.29, "eval_accuracy": 0.7514986376021798, "eval_loss": 0.5048823356628418, "eval_runtime": 142.1359, "eval_samples_per_second": 12.91, "eval_steps_per_second": 1.618, "step": 2300 }, { "epoch": 1.3, "grad_norm": 15.625, "learning_rate": 7.997133220910625e-06, "loss": 0.372, "step": 2310 }, { "epoch": 1.3, "grad_norm": 11.875, "learning_rate": 7.970713884204609e-06, "loss": 0.3266, "step": 2320 }, { "epoch": 1.31, "grad_norm": 13.375, "learning_rate": 7.944294547498595e-06, "loss": 0.3955, "step": 2330 }, { "epoch": 1.32, "grad_norm": 16.25, "learning_rate": 7.917875210792581e-06, "loss": 0.4296, "step": 2340 }, { "epoch": 1.32, "grad_norm": 16.0, "learning_rate": 7.891455874086565e-06, "loss": 0.3233, "step": 2350 }, { "epoch": 1.33, "grad_norm": 17.5, "learning_rate": 7.865036537380552e-06, "loss": 0.3842, "step": 2360 }, { "epoch": 1.33, "grad_norm": 14.25, "learning_rate": 7.838617200674538e-06, "loss": 0.4168, "step": 2370 }, { "epoch": 1.34, "grad_norm": 15.875, "learning_rate": 7.812197863968522e-06, "loss": 0.3938, "step": 2380 }, { "epoch": 1.34, "grad_norm": 13.0, "learning_rate": 7.785778527262506e-06, "loss": 0.3409, "step": 2390 }, { "epoch": 1.35, "grad_norm": 15.4375, "learning_rate": 7.759359190556492e-06, "loss": 0.3516, "step": 2400 }, { "epoch": 1.35, "eval_accuracy": 0.7569482288828338, "eval_loss": 0.5032666921615601, "eval_runtime": 142.0974, "eval_samples_per_second": 12.914, "eval_steps_per_second": 1.619, "step": 2400 }, { "epoch": 1.35, "grad_norm": 15.4375, "learning_rate": 7.732939853850478e-06, "loss": 0.3461, "step": 2410 }, { "epoch": 1.36, "grad_norm": 20.875, "learning_rate": 7.706520517144463e-06, "loss": 0.3672, "step": 2420 }, { "epoch": 1.37, "grad_norm": 15.75, "learning_rate": 7.680101180438449e-06, "loss": 0.3725, "step": 2430 }, { "epoch": 1.37, "grad_norm": 14.75, "learning_rate": 7.653681843732435e-06, "loss": 0.358, "step": 2440 }, { "epoch": 1.38, "grad_norm": 15.3125, "learning_rate": 7.627262507026419e-06, "loss": 0.3741, "step": 2450 }, { "epoch": 1.38, "grad_norm": 15.125, "learning_rate": 7.6008431703204044e-06, "loss": 0.3258, "step": 2460 }, { "epoch": 1.39, "grad_norm": 20.125, "learning_rate": 7.5744238336143905e-06, "loss": 0.3794, "step": 2470 }, { "epoch": 1.39, "grad_norm": 14.125, "learning_rate": 7.5480044969083765e-06, "loss": 0.3656, "step": 2480 }, { "epoch": 1.4, "grad_norm": 18.75, "learning_rate": 7.521585160202361e-06, "loss": 0.4406, "step": 2490 }, { "epoch": 1.41, "grad_norm": 11.0, "learning_rate": 7.495165823496347e-06, "loss": 0.3656, "step": 2500 }, { "epoch": 1.41, "eval_accuracy": 0.7564032697547683, "eval_loss": 0.4987718462944031, "eval_runtime": 141.9986, "eval_samples_per_second": 12.923, "eval_steps_per_second": 1.62, "step": 2500 }, { "epoch": 1.41, "grad_norm": 20.75, "learning_rate": 7.468746486790332e-06, "loss": 0.3657, "step": 2510 }, { "epoch": 1.42, "grad_norm": 13.0, "learning_rate": 7.4423271500843165e-06, "loss": 0.3721, "step": 2520 }, { "epoch": 1.42, "grad_norm": 11.9375, "learning_rate": 7.4159078133783026e-06, "loss": 0.3064, "step": 2530 }, { "epoch": 1.43, "grad_norm": 15.0, "learning_rate": 7.389488476672289e-06, "loss": 0.3378, "step": 2540 }, { "epoch": 1.43, "grad_norm": 24.875, "learning_rate": 7.363069139966274e-06, "loss": 0.3523, "step": 2550 }, { "epoch": 1.44, "grad_norm": 13.4375, "learning_rate": 7.336649803260258e-06, "loss": 0.3767, "step": 2560 }, { "epoch": 1.44, "grad_norm": 14.3125, "learning_rate": 7.310230466554244e-06, "loss": 0.2954, "step": 2570 }, { "epoch": 1.45, "grad_norm": 20.375, "learning_rate": 7.28381112984823e-06, "loss": 0.354, "step": 2580 }, { "epoch": 1.46, "grad_norm": 14.375, "learning_rate": 7.257391793142215e-06, "loss": 0.3656, "step": 2590 }, { "epoch": 1.46, "grad_norm": 18.0, "learning_rate": 7.230972456436201e-06, "loss": 0.3736, "step": 2600 }, { "epoch": 1.46, "eval_accuracy": 0.7618528610354224, "eval_loss": 0.5070914030075073, "eval_runtime": 141.9638, "eval_samples_per_second": 12.926, "eval_steps_per_second": 1.62, "step": 2600 }, { "epoch": 1.47, "grad_norm": 9.375, "learning_rate": 7.204553119730186e-06, "loss": 0.3521, "step": 2610 }, { "epoch": 1.47, "grad_norm": 14.5, "learning_rate": 7.17813378302417e-06, "loss": 0.322, "step": 2620 }, { "epoch": 1.48, "grad_norm": 16.0, "learning_rate": 7.151714446318156e-06, "loss": 0.3927, "step": 2630 }, { "epoch": 1.48, "grad_norm": 10.1875, "learning_rate": 7.125295109612142e-06, "loss": 0.3437, "step": 2640 }, { "epoch": 1.49, "grad_norm": 15.375, "learning_rate": 7.098875772906128e-06, "loss": 0.3386, "step": 2650 }, { "epoch": 1.5, "grad_norm": 18.875, "learning_rate": 7.072456436200112e-06, "loss": 0.3854, "step": 2660 }, { "epoch": 1.5, "grad_norm": 14.4375, "learning_rate": 7.046037099494098e-06, "loss": 0.3851, "step": 2670 }, { "epoch": 1.51, "grad_norm": 14.9375, "learning_rate": 7.019617762788083e-06, "loss": 0.4368, "step": 2680 }, { "epoch": 1.51, "grad_norm": 12.6875, "learning_rate": 6.993198426082069e-06, "loss": 0.3769, "step": 2690 }, { "epoch": 1.52, "grad_norm": 16.875, "learning_rate": 6.9667790893760545e-06, "loss": 0.4186, "step": 2700 }, { "epoch": 1.52, "eval_accuracy": 0.7591280653950954, "eval_loss": 0.49152591824531555, "eval_runtime": 141.888, "eval_samples_per_second": 12.933, "eval_steps_per_second": 1.621, "step": 2700 }, { "epoch": 1.52, "grad_norm": 20.75, "learning_rate": 6.94035975267004e-06, "loss": 0.3645, "step": 2710 }, { "epoch": 1.53, "grad_norm": 13.0, "learning_rate": 6.913940415964025e-06, "loss": 0.3904, "step": 2720 }, { "epoch": 1.53, "grad_norm": 17.25, "learning_rate": 6.88752107925801e-06, "loss": 0.336, "step": 2730 }, { "epoch": 1.54, "grad_norm": 18.375, "learning_rate": 6.861101742551996e-06, "loss": 0.3957, "step": 2740 }, { "epoch": 1.55, "grad_norm": 13.625, "learning_rate": 6.834682405845981e-06, "loss": 0.3611, "step": 2750 }, { "epoch": 1.55, "grad_norm": 15.0625, "learning_rate": 6.8082630691399666e-06, "loss": 0.3618, "step": 2760 }, { "epoch": 1.56, "grad_norm": 13.75, "learning_rate": 6.781843732433952e-06, "loss": 0.4071, "step": 2770 }, { "epoch": 1.56, "grad_norm": 12.3125, "learning_rate": 6.755424395727937e-06, "loss": 0.3076, "step": 2780 }, { "epoch": 1.57, "grad_norm": 20.75, "learning_rate": 6.729005059021923e-06, "loss": 0.3781, "step": 2790 }, { "epoch": 1.57, "grad_norm": 15.625, "learning_rate": 6.702585722315908e-06, "loss": 0.359, "step": 2800 }, { "epoch": 1.57, "eval_accuracy": 0.750408719346049, "eval_loss": 0.5078296661376953, "eval_runtime": 141.7858, "eval_samples_per_second": 12.942, "eval_steps_per_second": 1.622, "step": 2800 }, { "epoch": 1.58, "grad_norm": 17.5, "learning_rate": 6.6761663856098934e-06, "loss": 0.3914, "step": 2810 }, { "epoch": 1.59, "grad_norm": 23.0, "learning_rate": 6.649747048903879e-06, "loss": 0.381, "step": 2820 }, { "epoch": 1.59, "grad_norm": 15.6875, "learning_rate": 6.623327712197864e-06, "loss": 0.368, "step": 2830 }, { "epoch": 1.6, "grad_norm": 15.0, "learning_rate": 6.59690837549185e-06, "loss": 0.3871, "step": 2840 }, { "epoch": 1.6, "grad_norm": 25.625, "learning_rate": 6.570489038785835e-06, "loss": 0.365, "step": 2850 }, { "epoch": 1.61, "grad_norm": 16.625, "learning_rate": 6.54406970207982e-06, "loss": 0.3261, "step": 2860 }, { "epoch": 1.61, "grad_norm": 19.0, "learning_rate": 6.5176503653738055e-06, "loss": 0.3631, "step": 2870 }, { "epoch": 1.62, "grad_norm": 20.75, "learning_rate": 6.491231028667791e-06, "loss": 0.4074, "step": 2880 }, { "epoch": 1.62, "grad_norm": 16.0, "learning_rate": 6.464811691961777e-06, "loss": 0.3706, "step": 2890 }, { "epoch": 1.63, "grad_norm": 13.75, "learning_rate": 6.438392355255762e-06, "loss": 0.3324, "step": 2900 }, { "epoch": 1.63, "eval_accuracy": 0.7547683923705722, "eval_loss": 0.5029261708259583, "eval_runtime": 141.8255, "eval_samples_per_second": 12.938, "eval_steps_per_second": 1.622, "step": 2900 }, { "epoch": 1.64, "grad_norm": 19.125, "learning_rate": 6.411973018549747e-06, "loss": 0.4027, "step": 2910 }, { "epoch": 1.64, "grad_norm": 22.75, "learning_rate": 6.385553681843732e-06, "loss": 0.3884, "step": 2920 }, { "epoch": 1.65, "grad_norm": 19.25, "learning_rate": 6.3591343451377184e-06, "loss": 0.3037, "step": 2930 }, { "epoch": 1.65, "grad_norm": 14.4375, "learning_rate": 6.332715008431704e-06, "loss": 0.327, "step": 2940 }, { "epoch": 1.66, "grad_norm": 15.5625, "learning_rate": 6.306295671725689e-06, "loss": 0.342, "step": 2950 }, { "epoch": 1.66, "grad_norm": 15.5, "learning_rate": 6.279876335019674e-06, "loss": 0.3936, "step": 2960 }, { "epoch": 1.67, "grad_norm": 20.5, "learning_rate": 6.253456998313659e-06, "loss": 0.4121, "step": 2970 }, { "epoch": 1.67, "grad_norm": 19.125, "learning_rate": 6.227037661607645e-06, "loss": 0.3718, "step": 2980 }, { "epoch": 1.68, "grad_norm": 13.375, "learning_rate": 6.2006183249016305e-06, "loss": 0.385, "step": 2990 }, { "epoch": 1.69, "grad_norm": 15.5, "learning_rate": 6.174198988195616e-06, "loss": 0.324, "step": 3000 }, { "epoch": 1.69, "eval_accuracy": 0.7536784741144414, "eval_loss": 0.5054526329040527, "eval_runtime": 141.8758, "eval_samples_per_second": 12.934, "eval_steps_per_second": 1.621, "step": 3000 }, { "epoch": 1.69, "grad_norm": 14.0, "learning_rate": 6.147779651489601e-06, "loss": 0.3654, "step": 3010 }, { "epoch": 1.7, "grad_norm": 15.25, "learning_rate": 6.121360314783586e-06, "loss": 0.3384, "step": 3020 }, { "epoch": 1.7, "grad_norm": 13.4375, "learning_rate": 6.094940978077572e-06, "loss": 0.3573, "step": 3030 }, { "epoch": 1.71, "grad_norm": 19.375, "learning_rate": 6.068521641371557e-06, "loss": 0.3118, "step": 3040 }, { "epoch": 1.71, "grad_norm": 12.1875, "learning_rate": 6.042102304665543e-06, "loss": 0.4102, "step": 3050 }, { "epoch": 1.72, "grad_norm": 21.0, "learning_rate": 6.015682967959528e-06, "loss": 0.3904, "step": 3060 }, { "epoch": 1.73, "grad_norm": 21.375, "learning_rate": 5.989263631253513e-06, "loss": 0.4, "step": 3070 }, { "epoch": 1.73, "grad_norm": 14.625, "learning_rate": 5.962844294547499e-06, "loss": 0.3698, "step": 3080 }, { "epoch": 1.74, "grad_norm": 15.9375, "learning_rate": 5.936424957841484e-06, "loss": 0.3566, "step": 3090 }, { "epoch": 1.74, "grad_norm": 15.375, "learning_rate": 5.9100056211354695e-06, "loss": 0.3243, "step": 3100 }, { "epoch": 1.74, "eval_accuracy": 0.7656675749318801, "eval_loss": 0.5005462765693665, "eval_runtime": 141.843, "eval_samples_per_second": 12.937, "eval_steps_per_second": 1.622, "step": 3100 }, { "epoch": 1.75, "grad_norm": 13.0625, "learning_rate": 5.883586284429455e-06, "loss": 0.3342, "step": 3110 }, { "epoch": 1.75, "grad_norm": 19.0, "learning_rate": 5.85716694772344e-06, "loss": 0.4035, "step": 3120 }, { "epoch": 1.76, "grad_norm": 15.6875, "learning_rate": 5.830747611017426e-06, "loss": 0.3301, "step": 3130 }, { "epoch": 1.76, "grad_norm": 11.6875, "learning_rate": 5.804328274311411e-06, "loss": 0.3198, "step": 3140 }, { "epoch": 1.77, "grad_norm": 16.875, "learning_rate": 5.777908937605396e-06, "loss": 0.3503, "step": 3150 }, { "epoch": 1.78, "grad_norm": 19.625, "learning_rate": 5.7514896008993816e-06, "loss": 0.3682, "step": 3160 }, { "epoch": 1.78, "grad_norm": 19.5, "learning_rate": 5.725070264193367e-06, "loss": 0.427, "step": 3170 }, { "epoch": 1.79, "grad_norm": 13.75, "learning_rate": 5.698650927487353e-06, "loss": 0.3185, "step": 3180 }, { "epoch": 1.79, "grad_norm": 18.875, "learning_rate": 5.672231590781338e-06, "loss": 0.3533, "step": 3190 }, { "epoch": 1.8, "grad_norm": 17.0, "learning_rate": 5.645812254075324e-06, "loss": 0.4431, "step": 3200 }, { "epoch": 1.8, "eval_accuracy": 0.7640326975476839, "eval_loss": 0.5007506012916565, "eval_runtime": 141.8357, "eval_samples_per_second": 12.938, "eval_steps_per_second": 1.622, "step": 3200 }, { "epoch": 1.8, "grad_norm": 16.875, "learning_rate": 5.6193929173693085e-06, "loss": 0.4189, "step": 3210 }, { "epoch": 1.81, "grad_norm": 18.25, "learning_rate": 5.592973580663294e-06, "loss": 0.3313, "step": 3220 }, { "epoch": 1.82, "grad_norm": 15.3125, "learning_rate": 5.56655424395728e-06, "loss": 0.3793, "step": 3230 }, { "epoch": 1.82, "grad_norm": 21.125, "learning_rate": 5.540134907251265e-06, "loss": 0.369, "step": 3240 }, { "epoch": 1.83, "grad_norm": 19.875, "learning_rate": 5.513715570545251e-06, "loss": 0.3599, "step": 3250 }, { "epoch": 1.83, "grad_norm": 22.25, "learning_rate": 5.487296233839235e-06, "loss": 0.3753, "step": 3260 }, { "epoch": 1.84, "grad_norm": 23.375, "learning_rate": 5.4608768971332205e-06, "loss": 0.4055, "step": 3270 }, { "epoch": 1.84, "grad_norm": 17.25, "learning_rate": 5.434457560427207e-06, "loss": 0.3238, "step": 3280 }, { "epoch": 1.85, "grad_norm": 16.875, "learning_rate": 5.408038223721192e-06, "loss": 0.3479, "step": 3290 }, { "epoch": 1.85, "grad_norm": 13.5625, "learning_rate": 5.381618887015178e-06, "loss": 0.3659, "step": 3300 }, { "epoch": 1.85, "eval_accuracy": 0.7640326975476839, "eval_loss": 0.4973560571670532, "eval_runtime": 141.8385, "eval_samples_per_second": 12.937, "eval_steps_per_second": 1.622, "step": 3300 }, { "epoch": 1.86, "grad_norm": 18.375, "learning_rate": 5.355199550309163e-06, "loss": 0.372, "step": 3310 }, { "epoch": 1.87, "grad_norm": 22.125, "learning_rate": 5.328780213603147e-06, "loss": 0.3895, "step": 3320 }, { "epoch": 1.87, "grad_norm": 18.0, "learning_rate": 5.3023608768971335e-06, "loss": 0.4096, "step": 3330 }, { "epoch": 1.88, "grad_norm": 20.5, "learning_rate": 5.275941540191119e-06, "loss": 0.3394, "step": 3340 }, { "epoch": 1.88, "grad_norm": 18.125, "learning_rate": 5.249522203485105e-06, "loss": 0.3546, "step": 3350 }, { "epoch": 1.89, "grad_norm": 18.875, "learning_rate": 5.22310286677909e-06, "loss": 0.3722, "step": 3360 }, { "epoch": 1.89, "grad_norm": 15.625, "learning_rate": 5.196683530073074e-06, "loss": 0.3795, "step": 3370 }, { "epoch": 1.9, "grad_norm": 20.75, "learning_rate": 5.17026419336706e-06, "loss": 0.3258, "step": 3380 }, { "epoch": 1.91, "grad_norm": 15.875, "learning_rate": 5.1438448566610456e-06, "loss": 0.3453, "step": 3390 }, { "epoch": 1.91, "grad_norm": 15.375, "learning_rate": 5.117425519955032e-06, "loss": 0.3166, "step": 3400 }, { "epoch": 1.91, "eval_accuracy": 0.7623978201634878, "eval_loss": 0.503051221370697, "eval_runtime": 141.8256, "eval_samples_per_second": 12.938, "eval_steps_per_second": 1.622, "step": 3400 }, { "epoch": 1.92, "grad_norm": 14.6875, "learning_rate": 5.091006183249017e-06, "loss": 0.3411, "step": 3410 }, { "epoch": 1.92, "grad_norm": 18.75, "learning_rate": 5.064586846543002e-06, "loss": 0.3287, "step": 3420 }, { "epoch": 1.93, "grad_norm": 19.125, "learning_rate": 5.038167509836987e-06, "loss": 0.3478, "step": 3430 }, { "epoch": 1.93, "grad_norm": 17.0, "learning_rate": 5.0117481731309724e-06, "loss": 0.3409, "step": 3440 }, { "epoch": 1.94, "grad_norm": 13.625, "learning_rate": 4.9853288364249585e-06, "loss": 0.318, "step": 3450 }, { "epoch": 1.94, "grad_norm": 17.625, "learning_rate": 4.958909499718944e-06, "loss": 0.3176, "step": 3460 }, { "epoch": 1.95, "grad_norm": 13.4375, "learning_rate": 4.932490163012929e-06, "loss": 0.3066, "step": 3470 }, { "epoch": 1.96, "grad_norm": 16.75, "learning_rate": 4.906070826306914e-06, "loss": 0.3458, "step": 3480 }, { "epoch": 1.96, "grad_norm": 10.125, "learning_rate": 4.879651489600899e-06, "loss": 0.3236, "step": 3490 }, { "epoch": 1.97, "grad_norm": 13.25, "learning_rate": 4.853232152894885e-06, "loss": 0.3955, "step": 3500 }, { "epoch": 1.97, "eval_accuracy": 0.7656675749318801, "eval_loss": 0.5126229524612427, "eval_runtime": 141.9165, "eval_samples_per_second": 12.93, "eval_steps_per_second": 1.621, "step": 3500 }, { "epoch": 1.97, "grad_norm": 19.875, "learning_rate": 4.8268128161888706e-06, "loss": 0.3453, "step": 3510 }, { "epoch": 1.98, "grad_norm": 16.125, "learning_rate": 4.800393479482856e-06, "loss": 0.3729, "step": 3520 }, { "epoch": 1.98, "grad_norm": 20.625, "learning_rate": 4.773974142776841e-06, "loss": 0.378, "step": 3530 }, { "epoch": 1.99, "grad_norm": 17.625, "learning_rate": 4.747554806070826e-06, "loss": 0.4079, "step": 3540 }, { "epoch": 2.0, "grad_norm": 18.25, "learning_rate": 4.721135469364812e-06, "loss": 0.3344, "step": 3550 } ], "logging_steps": 10, "max_steps": 5337, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }