{ "best_metric": null, "best_model_checkpoint": null, "epoch": 19.83629191321499, "global_step": 2500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.08, "learning_rate": 6.2827061767850974e-06, "loss": 3.8041, "step": 10 }, { "epoch": 0.16, "learning_rate": 9.721930086076435e-06, "loss": 3.5706, "step": 20 }, { "epoch": 0.24, "learning_rate": 1.1424336578396131e-05, "loss": 3.2293, "step": 30 }, { "epoch": 0.32, "learning_rate": 1.2565412353570195e-05, "loss": 2.8198, "step": 40 }, { "epoch": 0.39, "learning_rate": 1.3424920790724471e-05, "loss": 2.6382, "step": 50 }, { "epoch": 0.47, "learning_rate": 1.4114674634318977e-05, "loss": 2.5852, "step": 60 }, { "epoch": 0.55, "learning_rate": 1.4690794260273606e-05, "loss": 2.4718, "step": 70 }, { "epoch": 0.63, "learning_rate": 1.5185478617142983e-05, "loss": 2.3908, "step": 80 }, { "epoch": 0.71, "learning_rate": 1.561892416495773e-05, "loss": 2.3321, "step": 90 }, { "epoch": 0.79, "learning_rate": 1.600463626286153e-05, "loss": 2.3622, "step": 100 }, { "epoch": 0.79, "eval_loss": 2.0142996311187744, "eval_runtime": 13.3285, "eval_samples_per_second": 92.958, "eval_steps_per_second": 18.607, "step": 100 }, { "epoch": 0.87, "learning_rate": 1.6352093070986755e-05, "loss": 2.2679, "step": 110 }, { "epoch": 0.95, "learning_rate": 1.666820294156779e-05, "loss": 2.2493, "step": 120 }, { "epoch": 1.03, "learning_rate": 1.6958156901812732e-05, "loss": 2.3766, "step": 130 }, { "epoch": 1.11, "learning_rate": 1.7225954557575116e-05, "loss": 2.1449, "step": 140 }, { "epoch": 1.19, "learning_rate": 1.747474292314672e-05, "loss": 2.1136, "step": 150 }, { "epoch": 1.27, "learning_rate": 1.770704275518123e-05, "loss": 2.107, "step": 160 }, { "epoch": 1.35, "learning_rate": 1.7924904433402933e-05, "loss": 2.1469, "step": 170 }, { "epoch": 1.43, "learning_rate": 1.8130018169564944e-05, "loss": 2.1229, "step": 180 }, { "epoch": 1.5, "learning_rate": 1.832379371735486e-05, "loss": 2.0893, "step": 190 }, { "epoch": 1.58, "learning_rate": 1.8507419182561513e-05, "loss": 2.0836, "step": 200 }, { "epoch": 1.58, "eval_loss": 1.813197135925293, "eval_runtime": 13.3546, "eval_samples_per_second": 92.777, "eval_steps_per_second": 18.57, "step": 200 }, { "epoch": 1.66, "learning_rate": 1.868190518296623e-05, "loss": 2.0307, "step": 210 }, { "epoch": 1.74, "learning_rate": 1.8848118530355293e-05, "loss": 2.0025, "step": 220 }, { "epoch": 1.82, "learning_rate": 1.9006808283301558e-05, "loss": 2.0116, "step": 230 }, { "epoch": 1.9, "learning_rate": 1.915862615484805e-05, "loss": 2.0355, "step": 240 }, { "epoch": 1.98, "learning_rate": 1.9304142682139013e-05, "loss": 2.0056, "step": 250 }, { "epoch": 2.06, "learning_rate": 1.945753054066002e-05, "loss": 2.1508, "step": 260 }, { "epoch": 2.14, "learning_rate": 1.959138057324679e-05, "loss": 1.9203, "step": 270 }, { "epoch": 2.22, "learning_rate": 1.9720308518695147e-05, "loss": 1.9358, "step": 280 }, { "epoch": 2.3, "learning_rate": 1.9844663575115566e-05, "loss": 1.877, "step": 290 }, { "epoch": 2.38, "learning_rate": 1.9964759049286476e-05, "loss": 1.9239, "step": 300 }, { "epoch": 2.38, "eval_loss": 1.726140022277832, "eval_runtime": 13.3534, "eval_samples_per_second": 92.785, "eval_steps_per_second": 18.572, "step": 300 }, { "epoch": 2.46, "learning_rate": 1.994594594594595e-05, "loss": 1.9449, "step": 310 }, { "epoch": 2.54, "learning_rate": 1.9855855855855857e-05, "loss": 1.895, "step": 320 }, { "epoch": 2.62, "learning_rate": 1.9765765765765768e-05, "loss": 1.864, "step": 330 }, { "epoch": 2.69, "learning_rate": 1.967567567567568e-05, "loss": 1.8928, "step": 340 }, { "epoch": 2.77, "learning_rate": 1.9585585585585586e-05, "loss": 1.8801, "step": 350 }, { "epoch": 2.85, "learning_rate": 1.9495495495495497e-05, "loss": 1.911, "step": 360 }, { "epoch": 2.93, "learning_rate": 1.9405405405405408e-05, "loss": 1.8945, "step": 370 }, { "epoch": 3.02, "learning_rate": 1.930630630630631e-05, "loss": 1.9508, "step": 380 }, { "epoch": 3.09, "learning_rate": 1.9216216216216216e-05, "loss": 1.7709, "step": 390 }, { "epoch": 3.17, "learning_rate": 1.9126126126126127e-05, "loss": 1.7693, "step": 400 }, { "epoch": 3.17, "eval_loss": 1.673058032989502, "eval_runtime": 13.3468, "eval_samples_per_second": 92.832, "eval_steps_per_second": 18.581, "step": 400 }, { "epoch": 3.25, "learning_rate": 1.9036036036036038e-05, "loss": 1.7773, "step": 410 }, { "epoch": 3.33, "learning_rate": 1.8945945945945945e-05, "loss": 1.8198, "step": 420 }, { "epoch": 3.41, "learning_rate": 1.8855855855855856e-05, "loss": 1.7813, "step": 430 }, { "epoch": 3.49, "learning_rate": 1.8765765765765767e-05, "loss": 1.7736, "step": 440 }, { "epoch": 3.57, "learning_rate": 1.8675675675675678e-05, "loss": 1.8544, "step": 450 }, { "epoch": 3.65, "learning_rate": 1.8585585585585585e-05, "loss": 1.7672, "step": 460 }, { "epoch": 3.73, "learning_rate": 1.8495495495495496e-05, "loss": 1.8223, "step": 470 }, { "epoch": 3.8, "learning_rate": 1.8405405405405407e-05, "loss": 1.7832, "step": 480 }, { "epoch": 3.88, "learning_rate": 1.8315315315315318e-05, "loss": 1.7937, "step": 490 }, { "epoch": 3.96, "learning_rate": 1.822522522522523e-05, "loss": 1.8078, "step": 500 }, { "epoch": 3.96, "eval_loss": 1.6267061233520508, "eval_runtime": 13.3535, "eval_samples_per_second": 92.784, "eval_steps_per_second": 18.572, "step": 500 }, { "epoch": 4.05, "learning_rate": 1.8126126126126127e-05, "loss": 1.8374, "step": 510 }, { "epoch": 4.13, "learning_rate": 1.8036036036036037e-05, "loss": 1.7228, "step": 520 }, { "epoch": 4.21, "learning_rate": 1.7945945945945948e-05, "loss": 1.6986, "step": 530 }, { "epoch": 4.28, "learning_rate": 1.7855855855855856e-05, "loss": 1.6858, "step": 540 }, { "epoch": 4.36, "learning_rate": 1.7765765765765767e-05, "loss": 1.7169, "step": 550 }, { "epoch": 4.44, "learning_rate": 1.7675675675675677e-05, "loss": 1.7069, "step": 560 }, { "epoch": 4.52, "learning_rate": 1.7585585585585588e-05, "loss": 1.6725, "step": 570 }, { "epoch": 4.6, "learning_rate": 1.7495495495495496e-05, "loss": 1.7052, "step": 580 }, { "epoch": 4.68, "learning_rate": 1.7405405405405406e-05, "loss": 1.6597, "step": 590 }, { "epoch": 4.76, "learning_rate": 1.7315315315315317e-05, "loss": 1.7565, "step": 600 }, { "epoch": 4.76, "eval_loss": 1.5994915962219238, "eval_runtime": 13.2972, "eval_samples_per_second": 93.177, "eval_steps_per_second": 18.651, "step": 600 }, { "epoch": 4.84, "learning_rate": 1.7225225225225225e-05, "loss": 1.723, "step": 610 }, { "epoch": 4.92, "learning_rate": 1.7135135135135135e-05, "loss": 1.6777, "step": 620 }, { "epoch": 4.99, "learning_rate": 1.7045045045045046e-05, "loss": 1.6943, "step": 630 }, { "epoch": 5.08, "learning_rate": 1.6954954954954957e-05, "loss": 1.7274, "step": 640 }, { "epoch": 5.16, "learning_rate": 1.6864864864864868e-05, "loss": 1.7062, "step": 650 }, { "epoch": 5.24, "learning_rate": 1.6774774774774775e-05, "loss": 1.6078, "step": 660 }, { "epoch": 5.32, "learning_rate": 1.6684684684684686e-05, "loss": 1.6484, "step": 670 }, { "epoch": 5.39, "learning_rate": 1.6594594594594597e-05, "loss": 1.6097, "step": 680 }, { "epoch": 5.47, "learning_rate": 1.6504504504504508e-05, "loss": 1.6255, "step": 690 }, { "epoch": 5.55, "learning_rate": 1.641441441441442e-05, "loss": 1.6188, "step": 700 }, { "epoch": 5.55, "eval_loss": 1.5767285823822021, "eval_runtime": 13.2952, "eval_samples_per_second": 93.191, "eval_steps_per_second": 18.653, "step": 700 }, { "epoch": 5.63, "learning_rate": 1.6324324324324326e-05, "loss": 1.68, "step": 710 }, { "epoch": 5.71, "learning_rate": 1.6234234234234237e-05, "loss": 1.618, "step": 720 }, { "epoch": 5.79, "learning_rate": 1.6144144144144144e-05, "loss": 1.5747, "step": 730 }, { "epoch": 5.87, "learning_rate": 1.6054054054054055e-05, "loss": 1.6617, "step": 740 }, { "epoch": 5.95, "learning_rate": 1.5963963963963966e-05, "loss": 1.6466, "step": 750 }, { "epoch": 6.03, "learning_rate": 1.5864864864864867e-05, "loss": 1.7621, "step": 760 }, { "epoch": 6.11, "learning_rate": 1.5774774774774778e-05, "loss": 1.5679, "step": 770 }, { "epoch": 6.19, "learning_rate": 1.5684684684684685e-05, "loss": 1.6356, "step": 780 }, { "epoch": 6.27, "learning_rate": 1.5594594594594596e-05, "loss": 1.5499, "step": 790 }, { "epoch": 6.35, "learning_rate": 1.5504504504504504e-05, "loss": 1.5783, "step": 800 }, { "epoch": 6.35, "eval_loss": 1.5619192123413086, "eval_runtime": 13.2986, "eval_samples_per_second": 93.168, "eval_steps_per_second": 18.649, "step": 800 }, { "epoch": 6.43, "learning_rate": 1.5414414414414414e-05, "loss": 1.5848, "step": 810 }, { "epoch": 6.5, "learning_rate": 1.5324324324324325e-05, "loss": 1.5019, "step": 820 }, { "epoch": 6.58, "learning_rate": 1.5234234234234236e-05, "loss": 1.6008, "step": 830 }, { "epoch": 6.66, "learning_rate": 1.5144144144144147e-05, "loss": 1.5757, "step": 840 }, { "epoch": 6.74, "learning_rate": 1.5054054054054054e-05, "loss": 1.5855, "step": 850 }, { "epoch": 6.82, "learning_rate": 1.4963963963963965e-05, "loss": 1.629, "step": 860 }, { "epoch": 6.9, "learning_rate": 1.4873873873873874e-05, "loss": 1.5948, "step": 870 }, { "epoch": 6.98, "learning_rate": 1.4783783783783785e-05, "loss": 1.5814, "step": 880 }, { "epoch": 7.06, "learning_rate": 1.4684684684684686e-05, "loss": 1.7112, "step": 890 }, { "epoch": 7.14, "learning_rate": 1.4594594594594596e-05, "loss": 1.4981, "step": 900 }, { "epoch": 7.14, "eval_loss": 1.546679139137268, "eval_runtime": 13.2814, "eval_samples_per_second": 93.288, "eval_steps_per_second": 18.673, "step": 900 }, { "epoch": 7.22, "learning_rate": 1.4504504504504506e-05, "loss": 1.5341, "step": 910 }, { "epoch": 7.3, "learning_rate": 1.4414414414414416e-05, "loss": 1.5301, "step": 920 }, { "epoch": 7.38, "learning_rate": 1.4324324324324326e-05, "loss": 1.5253, "step": 930 }, { "epoch": 7.46, "learning_rate": 1.4234234234234234e-05, "loss": 1.5697, "step": 940 }, { "epoch": 7.54, "learning_rate": 1.4144144144144145e-05, "loss": 1.5482, "step": 950 }, { "epoch": 7.62, "learning_rate": 1.4054054054054055e-05, "loss": 1.4849, "step": 960 }, { "epoch": 7.69, "learning_rate": 1.3963963963963964e-05, "loss": 1.536, "step": 970 }, { "epoch": 7.77, "learning_rate": 1.3873873873873875e-05, "loss": 1.5267, "step": 980 }, { "epoch": 7.85, "learning_rate": 1.3783783783783784e-05, "loss": 1.5003, "step": 990 }, { "epoch": 7.93, "learning_rate": 1.3693693693693695e-05, "loss": 1.5296, "step": 1000 }, { "epoch": 7.93, "eval_loss": 1.5358564853668213, "eval_runtime": 13.2903, "eval_samples_per_second": 93.226, "eval_steps_per_second": 18.66, "step": 1000 }, { "epoch": 8.02, "learning_rate": 1.3594594594594597e-05, "loss": 1.6728, "step": 1010 }, { "epoch": 8.09, "learning_rate": 1.3504504504504506e-05, "loss": 1.4944, "step": 1020 }, { "epoch": 8.17, "learning_rate": 1.3414414414414417e-05, "loss": 1.5016, "step": 1030 }, { "epoch": 8.25, "learning_rate": 1.3324324324324324e-05, "loss": 1.5407, "step": 1040 }, { "epoch": 8.33, "learning_rate": 1.3234234234234235e-05, "loss": 1.5129, "step": 1050 }, { "epoch": 8.41, "learning_rate": 1.3144144144144144e-05, "loss": 1.4774, "step": 1060 }, { "epoch": 8.49, "learning_rate": 1.3054054054054055e-05, "loss": 1.5181, "step": 1070 }, { "epoch": 8.57, "learning_rate": 1.2963963963963966e-05, "loss": 1.4897, "step": 1080 }, { "epoch": 8.65, "learning_rate": 1.2873873873873875e-05, "loss": 1.4918, "step": 1090 }, { "epoch": 8.73, "learning_rate": 1.2783783783783785e-05, "loss": 1.4734, "step": 1100 }, { "epoch": 8.73, "eval_loss": 1.5299837589263916, "eval_runtime": 13.3097, "eval_samples_per_second": 93.09, "eval_steps_per_second": 18.633, "step": 1100 }, { "epoch": 8.8, "learning_rate": 1.2693693693693695e-05, "loss": 1.4425, "step": 1110 }, { "epoch": 8.88, "learning_rate": 1.2603603603603605e-05, "loss": 1.5102, "step": 1120 }, { "epoch": 8.96, "learning_rate": 1.2513513513513516e-05, "loss": 1.4614, "step": 1130 }, { "epoch": 9.05, "learning_rate": 1.2423423423423424e-05, "loss": 1.5378, "step": 1140 }, { "epoch": 9.13, "learning_rate": 1.2333333333333334e-05, "loss": 1.5182, "step": 1150 }, { "epoch": 9.21, "learning_rate": 1.2243243243243244e-05, "loss": 1.4606, "step": 1160 }, { "epoch": 9.28, "learning_rate": 1.2153153153153154e-05, "loss": 1.4675, "step": 1170 }, { "epoch": 9.36, "learning_rate": 1.2063063063063063e-05, "loss": 1.5261, "step": 1180 }, { "epoch": 9.44, "learning_rate": 1.1972972972972974e-05, "loss": 1.4602, "step": 1190 }, { "epoch": 9.52, "learning_rate": 1.1882882882882885e-05, "loss": 1.4415, "step": 1200 }, { "epoch": 9.52, "eval_loss": 1.521682620048523, "eval_runtime": 13.2939, "eval_samples_per_second": 93.2, "eval_steps_per_second": 18.655, "step": 1200 }, { "epoch": 9.6, "learning_rate": 1.1792792792792792e-05, "loss": 1.4601, "step": 1210 }, { "epoch": 9.68, "learning_rate": 1.1702702702702703e-05, "loss": 1.4676, "step": 1220 }, { "epoch": 9.76, "learning_rate": 1.1612612612612612e-05, "loss": 1.4213, "step": 1230 }, { "epoch": 9.84, "learning_rate": 1.1522522522522523e-05, "loss": 1.4096, "step": 1240 }, { "epoch": 9.92, "learning_rate": 1.1432432432432434e-05, "loss": 1.4475, "step": 1250 }, { "epoch": 9.99, "learning_rate": 1.1342342342342343e-05, "loss": 1.4477, "step": 1260 }, { "epoch": 10.08, "learning_rate": 1.1243243243243245e-05, "loss": 1.4979, "step": 1270 }, { "epoch": 10.16, "learning_rate": 1.1153153153153154e-05, "loss": 1.4436, "step": 1280 }, { "epoch": 10.24, "learning_rate": 1.1063063063063065e-05, "loss": 1.4913, "step": 1290 }, { "epoch": 10.32, "learning_rate": 1.0972972972972974e-05, "loss": 1.4513, "step": 1300 }, { "epoch": 10.32, "eval_loss": 1.5171560049057007, "eval_runtime": 13.2984, "eval_samples_per_second": 93.169, "eval_steps_per_second": 18.649, "step": 1300 }, { "epoch": 10.39, "learning_rate": 1.0882882882882884e-05, "loss": 1.4649, "step": 1310 }, { "epoch": 10.47, "learning_rate": 1.0792792792792795e-05, "loss": 1.4126, "step": 1320 }, { "epoch": 10.55, "learning_rate": 1.0702702702702703e-05, "loss": 1.3916, "step": 1330 }, { "epoch": 10.63, "learning_rate": 1.0612612612612613e-05, "loss": 1.393, "step": 1340 }, { "epoch": 10.71, "learning_rate": 1.0522522522522523e-05, "loss": 1.3972, "step": 1350 }, { "epoch": 10.79, "learning_rate": 1.0432432432432433e-05, "loss": 1.4867, "step": 1360 }, { "epoch": 10.87, "learning_rate": 1.0342342342342344e-05, "loss": 1.4109, "step": 1370 }, { "epoch": 10.95, "learning_rate": 1.0252252252252253e-05, "loss": 1.4215, "step": 1380 }, { "epoch": 11.03, "learning_rate": 1.0153153153153155e-05, "loss": 1.5288, "step": 1390 }, { "epoch": 11.11, "learning_rate": 1.0063063063063064e-05, "loss": 1.3782, "step": 1400 }, { "epoch": 11.11, "eval_loss": 1.5077377557754517, "eval_runtime": 13.2758, "eval_samples_per_second": 93.328, "eval_steps_per_second": 18.681, "step": 1400 }, { "epoch": 11.19, "learning_rate": 9.972972972972975e-06, "loss": 1.4292, "step": 1410 }, { "epoch": 11.27, "learning_rate": 9.882882882882884e-06, "loss": 1.4404, "step": 1420 }, { "epoch": 11.35, "learning_rate": 9.792792792792793e-06, "loss": 1.4192, "step": 1430 }, { "epoch": 11.43, "learning_rate": 9.702702702702704e-06, "loss": 1.324, "step": 1440 }, { "epoch": 11.5, "learning_rate": 9.612612612612613e-06, "loss": 1.4342, "step": 1450 }, { "epoch": 11.58, "learning_rate": 9.522522522522524e-06, "loss": 1.4233, "step": 1460 }, { "epoch": 11.66, "learning_rate": 9.432432432432433e-06, "loss": 1.3475, "step": 1470 }, { "epoch": 11.74, "learning_rate": 9.342342342342344e-06, "loss": 1.4178, "step": 1480 }, { "epoch": 11.82, "learning_rate": 9.252252252252253e-06, "loss": 1.3842, "step": 1490 }, { "epoch": 11.9, "learning_rate": 9.162162162162162e-06, "loss": 1.4103, "step": 1500 }, { "epoch": 11.9, "eval_loss": 1.507853388786316, "eval_runtime": 13.2911, "eval_samples_per_second": 93.22, "eval_steps_per_second": 18.659, "step": 1500 }, { "epoch": 11.98, "learning_rate": 9.072072072072073e-06, "loss": 1.3918, "step": 1510 }, { "epoch": 12.06, "learning_rate": 8.972972972972974e-06, "loss": 1.4841, "step": 1520 }, { "epoch": 12.14, "learning_rate": 8.882882882882883e-06, "loss": 1.3866, "step": 1530 }, { "epoch": 12.22, "learning_rate": 8.792792792792794e-06, "loss": 1.3713, "step": 1540 }, { "epoch": 12.3, "learning_rate": 8.702702702702703e-06, "loss": 1.3384, "step": 1550 }, { "epoch": 12.38, "learning_rate": 8.612612612612612e-06, "loss": 1.4079, "step": 1560 }, { "epoch": 12.46, "learning_rate": 8.522522522522523e-06, "loss": 1.3715, "step": 1570 }, { "epoch": 12.54, "learning_rate": 8.432432432432434e-06, "loss": 1.3541, "step": 1580 }, { "epoch": 12.62, "learning_rate": 8.342342342342343e-06, "loss": 1.3764, "step": 1590 }, { "epoch": 12.69, "learning_rate": 8.252252252252254e-06, "loss": 1.3907, "step": 1600 }, { "epoch": 12.69, "eval_loss": 1.5033966302871704, "eval_runtime": 13.2864, "eval_samples_per_second": 93.254, "eval_steps_per_second": 18.666, "step": 1600 }, { "epoch": 12.77, "learning_rate": 8.162162162162163e-06, "loss": 1.3626, "step": 1610 }, { "epoch": 12.85, "learning_rate": 8.072072072072072e-06, "loss": 1.4094, "step": 1620 }, { "epoch": 12.93, "learning_rate": 7.981981981981983e-06, "loss": 1.3579, "step": 1630 }, { "epoch": 13.02, "learning_rate": 7.891891891891894e-06, "loss": 1.4517, "step": 1640 }, { "epoch": 13.09, "learning_rate": 7.801801801801803e-06, "loss": 1.301, "step": 1650 }, { "epoch": 13.17, "learning_rate": 7.711711711711712e-06, "loss": 1.3513, "step": 1660 }, { "epoch": 13.25, "learning_rate": 7.621621621621622e-06, "loss": 1.3487, "step": 1670 }, { "epoch": 13.33, "learning_rate": 7.531531531531532e-06, "loss": 1.3894, "step": 1680 }, { "epoch": 13.41, "learning_rate": 7.441441441441442e-06, "loss": 1.3619, "step": 1690 }, { "epoch": 13.49, "learning_rate": 7.3513513513513525e-06, "loss": 1.3663, "step": 1700 }, { "epoch": 13.49, "eval_loss": 1.501574993133545, "eval_runtime": 13.3092, "eval_samples_per_second": 93.094, "eval_steps_per_second": 18.634, "step": 1700 }, { "epoch": 13.57, "learning_rate": 7.2612612612612625e-06, "loss": 1.3268, "step": 1710 }, { "epoch": 13.65, "learning_rate": 7.1711711711711716e-06, "loss": 1.3703, "step": 1720 }, { "epoch": 13.73, "learning_rate": 7.0810810810810815e-06, "loss": 1.4246, "step": 1730 }, { "epoch": 13.8, "learning_rate": 6.9909909909909915e-06, "loss": 1.3642, "step": 1740 }, { "epoch": 13.88, "learning_rate": 6.900900900900901e-06, "loss": 1.3467, "step": 1750 }, { "epoch": 13.96, "learning_rate": 6.810810810810811e-06, "loss": 1.3802, "step": 1760 }, { "epoch": 14.05, "learning_rate": 6.711711711711713e-06, "loss": 1.4328, "step": 1770 }, { "epoch": 14.13, "learning_rate": 6.621621621621622e-06, "loss": 1.3316, "step": 1780 }, { "epoch": 14.21, "learning_rate": 6.531531531531532e-06, "loss": 1.3634, "step": 1790 }, { "epoch": 14.28, "learning_rate": 6.441441441441442e-06, "loss": 1.3565, "step": 1800 }, { "epoch": 14.28, "eval_loss": 1.4980181455612183, "eval_runtime": 13.2875, "eval_samples_per_second": 93.246, "eval_steps_per_second": 18.664, "step": 1800 }, { "epoch": 14.36, "learning_rate": 6.351351351351351e-06, "loss": 1.3217, "step": 1810 }, { "epoch": 14.44, "learning_rate": 6.261261261261262e-06, "loss": 1.33, "step": 1820 }, { "epoch": 14.52, "learning_rate": 6.171171171171172e-06, "loss": 1.3513, "step": 1830 }, { "epoch": 14.6, "learning_rate": 6.081081081081082e-06, "loss": 1.3649, "step": 1840 }, { "epoch": 14.68, "learning_rate": 5.990990990990992e-06, "loss": 1.3462, "step": 1850 }, { "epoch": 14.76, "learning_rate": 5.900900900900901e-06, "loss": 1.3454, "step": 1860 }, { "epoch": 14.84, "learning_rate": 5.810810810810811e-06, "loss": 1.3316, "step": 1870 }, { "epoch": 14.92, "learning_rate": 5.720720720720722e-06, "loss": 1.3347, "step": 1880 }, { "epoch": 14.99, "learning_rate": 5.6306306306306316e-06, "loss": 1.3039, "step": 1890 }, { "epoch": 15.08, "learning_rate": 5.531531531531532e-06, "loss": 1.4057, "step": 1900 }, { "epoch": 15.08, "eval_loss": 1.4985554218292236, "eval_runtime": 13.2747, "eval_samples_per_second": 93.335, "eval_steps_per_second": 18.682, "step": 1900 }, { "epoch": 15.16, "learning_rate": 5.441441441441442e-06, "loss": 1.3082, "step": 1910 }, { "epoch": 15.24, "learning_rate": 5.351351351351351e-06, "loss": 1.3589, "step": 1920 }, { "epoch": 15.32, "learning_rate": 5.261261261261261e-06, "loss": 1.3235, "step": 1930 }, { "epoch": 15.39, "learning_rate": 5.171171171171172e-06, "loss": 1.3153, "step": 1940 }, { "epoch": 15.47, "learning_rate": 5.081081081081082e-06, "loss": 1.3345, "step": 1950 }, { "epoch": 15.55, "learning_rate": 4.990990990990991e-06, "loss": 1.3824, "step": 1960 }, { "epoch": 15.63, "learning_rate": 4.900900900900901e-06, "loss": 1.291, "step": 1970 }, { "epoch": 15.71, "learning_rate": 4.810810810810811e-06, "loss": 1.3106, "step": 1980 }, { "epoch": 15.79, "learning_rate": 4.720720720720721e-06, "loss": 1.3559, "step": 1990 }, { "epoch": 15.87, "learning_rate": 4.630630630630631e-06, "loss": 1.3406, "step": 2000 }, { "epoch": 15.87, "eval_loss": 1.4952143430709839, "eval_runtime": 13.2782, "eval_samples_per_second": 93.31, "eval_steps_per_second": 18.677, "step": 2000 }, { "epoch": 15.95, "learning_rate": 4.540540540540541e-06, "loss": 1.328, "step": 2010 }, { "epoch": 16.03, "learning_rate": 4.441441441441442e-06, "loss": 1.4218, "step": 2020 }, { "epoch": 16.11, "learning_rate": 4.351351351351352e-06, "loss": 1.2962, "step": 2030 }, { "epoch": 16.19, "learning_rate": 4.2612612612612615e-06, "loss": 1.3122, "step": 2040 }, { "epoch": 16.27, "learning_rate": 4.1711711711711715e-06, "loss": 1.3641, "step": 2050 }, { "epoch": 16.35, "learning_rate": 4.0810810810810815e-06, "loss": 1.3058, "step": 2060 }, { "epoch": 16.43, "learning_rate": 3.990990990990991e-06, "loss": 1.2986, "step": 2070 }, { "epoch": 16.5, "learning_rate": 3.900900900900901e-06, "loss": 1.2902, "step": 2080 }, { "epoch": 16.58, "learning_rate": 3.810810810810811e-06, "loss": 1.3725, "step": 2090 }, { "epoch": 16.66, "learning_rate": 3.720720720720721e-06, "loss": 1.3031, "step": 2100 }, { "epoch": 16.66, "eval_loss": 1.495803713798523, "eval_runtime": 13.2846, "eval_samples_per_second": 93.266, "eval_steps_per_second": 18.668, "step": 2100 }, { "epoch": 16.74, "learning_rate": 3.6306306306306312e-06, "loss": 1.3091, "step": 2110 }, { "epoch": 16.82, "learning_rate": 3.5405405405405408e-06, "loss": 1.3003, "step": 2120 }, { "epoch": 16.9, "learning_rate": 3.4504504504504503e-06, "loss": 1.2694, "step": 2130 }, { "epoch": 16.98, "learning_rate": 3.3603603603603607e-06, "loss": 1.3349, "step": 2140 }, { "epoch": 17.06, "learning_rate": 3.2702702702702706e-06, "loss": 1.3553, "step": 2150 }, { "epoch": 17.14, "learning_rate": 3.1801801801801806e-06, "loss": 1.2844, "step": 2160 }, { "epoch": 17.22, "learning_rate": 3.0900900900900905e-06, "loss": 1.2815, "step": 2170 }, { "epoch": 17.3, "learning_rate": 3e-06, "loss": 1.2756, "step": 2180 }, { "epoch": 17.38, "learning_rate": 2.9099099099099105e-06, "loss": 1.33, "step": 2190 }, { "epoch": 17.46, "learning_rate": 2.81981981981982e-06, "loss": 1.31, "step": 2200 }, { "epoch": 17.46, "eval_loss": 1.4959148168563843, "eval_runtime": 13.2926, "eval_samples_per_second": 93.21, "eval_steps_per_second": 18.657, "step": 2200 }, { "epoch": 17.54, "learning_rate": 2.72972972972973e-06, "loss": 1.3306, "step": 2210 }, { "epoch": 17.62, "learning_rate": 2.63963963963964e-06, "loss": 1.331, "step": 2220 }, { "epoch": 17.69, "learning_rate": 2.54954954954955e-06, "loss": 1.3347, "step": 2230 }, { "epoch": 17.77, "learning_rate": 2.45945945945946e-06, "loss": 1.3291, "step": 2240 }, { "epoch": 17.85, "learning_rate": 2.3693693693693693e-06, "loss": 1.2774, "step": 2250 }, { "epoch": 17.93, "learning_rate": 2.2792792792792793e-06, "loss": 1.3102, "step": 2260 }, { "epoch": 18.02, "learning_rate": 2.1801801801801804e-06, "loss": 1.4393, "step": 2270 }, { "epoch": 18.09, "learning_rate": 2.0900900900900904e-06, "loss": 1.2779, "step": 2280 }, { "epoch": 18.17, "learning_rate": 2.0000000000000003e-06, "loss": 1.2708, "step": 2290 }, { "epoch": 18.25, "learning_rate": 1.90990990990991e-06, "loss": 1.3565, "step": 2300 }, { "epoch": 18.25, "eval_loss": 1.494253396987915, "eval_runtime": 13.2974, "eval_samples_per_second": 93.176, "eval_steps_per_second": 18.65, "step": 2300 }, { "epoch": 18.33, "learning_rate": 1.81981981981982e-06, "loss": 1.2744, "step": 2310 }, { "epoch": 18.41, "learning_rate": 1.72972972972973e-06, "loss": 1.2697, "step": 2320 }, { "epoch": 18.49, "learning_rate": 1.6396396396396397e-06, "loss": 1.2921, "step": 2330 }, { "epoch": 18.57, "learning_rate": 1.5495495495495497e-06, "loss": 1.3096, "step": 2340 }, { "epoch": 18.65, "learning_rate": 1.4594594594594596e-06, "loss": 1.2971, "step": 2350 }, { "epoch": 18.73, "learning_rate": 1.3693693693693694e-06, "loss": 1.3132, "step": 2360 }, { "epoch": 18.8, "learning_rate": 1.2792792792792793e-06, "loss": 1.3367, "step": 2370 }, { "epoch": 18.88, "learning_rate": 1.1891891891891893e-06, "loss": 1.2648, "step": 2380 }, { "epoch": 18.96, "learning_rate": 1.0990990990990993e-06, "loss": 1.3025, "step": 2390 }, { "epoch": 19.05, "learning_rate": 1.0000000000000002e-06, "loss": 1.3732, "step": 2400 }, { "epoch": 19.05, "eval_loss": 1.4953521490097046, "eval_runtime": 13.2927, "eval_samples_per_second": 93.209, "eval_steps_per_second": 18.657, "step": 2400 }, { "epoch": 19.13, "learning_rate": 9.0990990990991e-07, "loss": 1.2938, "step": 2410 }, { "epoch": 19.21, "learning_rate": 8.198198198198199e-07, "loss": 1.3374, "step": 2420 }, { "epoch": 19.28, "learning_rate": 7.297297297297298e-07, "loss": 1.297, "step": 2430 }, { "epoch": 19.36, "learning_rate": 6.396396396396397e-07, "loss": 1.2748, "step": 2440 }, { "epoch": 19.44, "learning_rate": 5.495495495495496e-07, "loss": 1.3259, "step": 2450 }, { "epoch": 19.52, "learning_rate": 4.5945945945945953e-07, "loss": 1.3099, "step": 2460 }, { "epoch": 19.6, "learning_rate": 3.693693693693694e-07, "loss": 1.3277, "step": 2470 }, { "epoch": 19.68, "learning_rate": 2.792792792792793e-07, "loss": 1.2708, "step": 2480 }, { "epoch": 19.76, "learning_rate": 1.8918918918918921e-07, "loss": 1.2546, "step": 2490 }, { "epoch": 19.84, "learning_rate": 9.90990990990991e-08, "loss": 1.2705, "step": 2500 }, { "epoch": 19.84, "eval_loss": 1.494584560394287, "eval_runtime": 13.2864, "eval_samples_per_second": 93.253, "eval_steps_per_second": 18.666, "step": 2500 } ], "max_steps": 2520, "num_train_epochs": 20, "total_flos": 5.993865079244718e+17, "trial_name": null, "trial_params": null }