{ "best_metric": null, "best_model_checkpoint": null, "epoch": 492.45432883240665, "global_step": 620000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.4, "learning_rate": 1.999205718824464e-05, "loss": 25.5103, "step": 500 }, { "epoch": 0.79, "learning_rate": 1.9984114376489278e-05, "loss": 18.4925, "step": 1000 }, { "epoch": 1.0, "eval_loss": 14.646448135375977, "eval_runtime": 70.1643, "eval_samples_per_second": 6121.707, "eval_steps_per_second": 1.995, "step": 1259 }, { "epoch": 1.19, "learning_rate": 1.9976171564733916e-05, "loss": 15.0445, "step": 1500 }, { "epoch": 1.59, "learning_rate": 1.9968228752978554e-05, "loss": 12.5296, "step": 2000 }, { "epoch": 1.99, "learning_rate": 1.9960285941223196e-05, "loss": 10.6706, "step": 2500 }, { "epoch": 2.0, "eval_loss": 9.694795608520508, "eval_runtime": 67.5462, "eval_samples_per_second": 6358.978, "eval_steps_per_second": 2.073, "step": 2518 }, { "epoch": 2.38, "learning_rate": 1.9952343129467834e-05, "loss": 9.2923, "step": 3000 }, { "epoch": 2.78, "learning_rate": 1.9944400317712472e-05, "loss": 8.3023, "step": 3500 }, { "epoch": 3.0, "eval_loss": 7.442232608795166, "eval_runtime": 68.0543, "eval_samples_per_second": 6311.503, "eval_steps_per_second": 2.057, "step": 3777 }, { "epoch": 3.18, "learning_rate": 1.993645750595711e-05, "loss": 7.6019, "step": 4000 }, { "epoch": 3.57, "learning_rate": 1.9928514694201748e-05, "loss": 7.1126, "step": 4500 }, { "epoch": 3.97, "learning_rate": 1.9920571882446386e-05, "loss": 6.7538, "step": 5000 }, { "epoch": 4.0, "eval_loss": 6.4947967529296875, "eval_runtime": 67.4445, "eval_samples_per_second": 6368.567, "eval_steps_per_second": 2.076, "step": 5036 }, { "epoch": 4.37, "learning_rate": 1.9912629070691024e-05, "loss": 6.4873, "step": 5500 }, { "epoch": 4.77, "learning_rate": 1.9904686258935662e-05, "loss": 6.2941, "step": 6000 }, { "epoch": 5.0, "eval_loss": 6.058001518249512, "eval_runtime": 66.6412, "eval_samples_per_second": 6445.335, "eval_steps_per_second": 2.101, "step": 6295 }, { "epoch": 5.16, "learning_rate": 1.9896743447180304e-05, "loss": 6.1608, "step": 6500 }, { "epoch": 5.56, "learning_rate": 1.9888800635424942e-05, "loss": 6.0606, "step": 7000 }, { "epoch": 5.96, "learning_rate": 1.988085782366958e-05, "loss": 5.9841, "step": 7500 }, { "epoch": 6.0, "eval_loss": 5.85178279876709, "eval_runtime": 67.6703, "eval_samples_per_second": 6347.319, "eval_steps_per_second": 2.069, "step": 7554 }, { "epoch": 6.35, "learning_rate": 1.987291501191422e-05, "loss": 5.9144, "step": 8000 }, { "epoch": 6.75, "learning_rate": 1.986497220015886e-05, "loss": 5.8638, "step": 8500 }, { "epoch": 7.0, "eval_loss": 5.727705955505371, "eval_runtime": 66.5854, "eval_samples_per_second": 6450.735, "eval_steps_per_second": 2.103, "step": 8813 }, { "epoch": 7.15, "learning_rate": 1.9857029388403498e-05, "loss": 5.8139, "step": 9000 }, { "epoch": 7.55, "learning_rate": 1.9849086576648136e-05, "loss": 5.7603, "step": 9500 }, { "epoch": 7.94, "learning_rate": 1.9841143764892774e-05, "loss": 5.6951, "step": 10000 }, { "epoch": 8.0, "eval_loss": 5.557251453399658, "eval_runtime": 66.7182, "eval_samples_per_second": 6437.901, "eval_steps_per_second": 2.098, "step": 10072 }, { "epoch": 8.34, "learning_rate": 1.9833200953137412e-05, "loss": 5.6463, "step": 10500 }, { "epoch": 8.74, "learning_rate": 1.982525814138205e-05, "loss": 5.6046, "step": 11000 }, { "epoch": 9.0, "eval_loss": 5.441849231719971, "eval_runtime": 69.0026, "eval_samples_per_second": 6224.763, "eval_steps_per_second": 2.029, "step": 11331 }, { "epoch": 9.13, "learning_rate": 1.981731532962669e-05, "loss": 5.5704, "step": 11500 }, { "epoch": 9.53, "learning_rate": 1.9809372517871326e-05, "loss": 5.5179, "step": 12000 }, { "epoch": 9.93, "learning_rate": 1.9801429706115968e-05, "loss": 5.4583, "step": 12500 }, { "epoch": 10.0, "eval_loss": 5.300734519958496, "eval_runtime": 67.6172, "eval_samples_per_second": 6352.303, "eval_steps_per_second": 2.07, "step": 12590 }, { "epoch": 10.33, "learning_rate": 1.9793486894360606e-05, "loss": 5.4088, "step": 13000 }, { "epoch": 10.72, "learning_rate": 1.9785544082605244e-05, "loss": 5.3664, "step": 13500 }, { "epoch": 11.0, "eval_loss": 5.175403594970703, "eval_runtime": 66.4605, "eval_samples_per_second": 6462.863, "eval_steps_per_second": 2.107, "step": 13849 }, { "epoch": 11.12, "learning_rate": 1.9777601270849882e-05, "loss": 5.325, "step": 14000 }, { "epoch": 11.52, "learning_rate": 1.976965845909452e-05, "loss": 5.2849, "step": 14500 }, { "epoch": 11.91, "learning_rate": 1.976171564733916e-05, "loss": 5.2439, "step": 15000 }, { "epoch": 12.0, "eval_loss": 5.062605857849121, "eval_runtime": 66.7633, "eval_samples_per_second": 6433.547, "eval_steps_per_second": 2.097, "step": 15108 }, { "epoch": 12.31, "learning_rate": 1.9753772835583797e-05, "loss": 5.2041, "step": 15500 }, { "epoch": 12.71, "learning_rate": 1.9745830023828435e-05, "loss": 5.1685, "step": 16000 }, { "epoch": 13.0, "eval_loss": 4.951611518859863, "eval_runtime": 67.703, "eval_samples_per_second": 6344.256, "eval_steps_per_second": 2.068, "step": 16367 }, { "epoch": 13.11, "learning_rate": 1.9737887212073076e-05, "loss": 5.1323, "step": 16500 }, { "epoch": 13.5, "learning_rate": 1.9729944400317714e-05, "loss": 5.093, "step": 17000 }, { "epoch": 13.9, "learning_rate": 1.9722001588562352e-05, "loss": 5.0551, "step": 17500 }, { "epoch": 14.0, "eval_loss": 4.854883193969727, "eval_runtime": 66.6895, "eval_samples_per_second": 6440.672, "eval_steps_per_second": 2.099, "step": 17626 }, { "epoch": 14.3, "learning_rate": 1.971405877680699e-05, "loss": 5.0229, "step": 18000 }, { "epoch": 14.69, "learning_rate": 1.9706115965051632e-05, "loss": 4.9884, "step": 18500 }, { "epoch": 15.0, "eval_loss": 4.770427227020264, "eval_runtime": 68.6298, "eval_samples_per_second": 6258.579, "eval_steps_per_second": 2.04, "step": 18885 }, { "epoch": 15.09, "learning_rate": 1.969817315329627e-05, "loss": 4.9593, "step": 19000 }, { "epoch": 15.49, "learning_rate": 1.9690230341540908e-05, "loss": 4.9279, "step": 19500 }, { "epoch": 15.89, "learning_rate": 1.9682287529785546e-05, "loss": 4.9021, "step": 20000 }, { "epoch": 16.0, "eval_loss": 4.701778888702393, "eval_runtime": 68.9532, "eval_samples_per_second": 6229.223, "eval_steps_per_second": 2.03, "step": 20144 }, { "epoch": 16.28, "learning_rate": 1.9674344718030184e-05, "loss": 4.874, "step": 20500 }, { "epoch": 16.68, "learning_rate": 1.9666401906274823e-05, "loss": 4.8472, "step": 21000 }, { "epoch": 17.0, "eval_loss": 4.633734226226807, "eval_runtime": 69.3844, "eval_samples_per_second": 6190.508, "eval_steps_per_second": 2.018, "step": 21403 }, { "epoch": 17.08, "learning_rate": 1.965845909451946e-05, "loss": 4.8209, "step": 21500 }, { "epoch": 17.47, "learning_rate": 1.96505162827641e-05, "loss": 4.7976, "step": 22000 }, { "epoch": 17.87, "learning_rate": 1.964257347100874e-05, "loss": 4.7743, "step": 22500 }, { "epoch": 18.0, "eval_loss": 4.567419052124023, "eval_runtime": 66.7695, "eval_samples_per_second": 6432.952, "eval_steps_per_second": 2.097, "step": 22662 }, { "epoch": 18.27, "learning_rate": 1.963463065925338e-05, "loss": 4.7496, "step": 23000 }, { "epoch": 18.67, "learning_rate": 1.9626687847498017e-05, "loss": 4.7257, "step": 23500 }, { "epoch": 19.0, "eval_loss": 4.51430606842041, "eval_runtime": 67.2204, "eval_samples_per_second": 6389.805, "eval_steps_per_second": 2.083, "step": 23921 }, { "epoch": 19.06, "learning_rate": 1.9618745035742655e-05, "loss": 4.7034, "step": 24000 }, { "epoch": 19.46, "learning_rate": 1.9610802223987293e-05, "loss": 4.682, "step": 24500 }, { "epoch": 19.86, "learning_rate": 1.960285941223193e-05, "loss": 4.6581, "step": 25000 }, { "epoch": 20.0, "eval_loss": 4.458188056945801, "eval_runtime": 68.773, "eval_samples_per_second": 6245.549, "eval_steps_per_second": 2.036, "step": 25180 }, { "epoch": 20.25, "learning_rate": 1.959491660047657e-05, "loss": 4.6403, "step": 25500 }, { "epoch": 20.65, "learning_rate": 1.9586973788721207e-05, "loss": 4.6216, "step": 26000 }, { "epoch": 21.0, "eval_loss": 4.413742542266846, "eval_runtime": 71.3546, "eval_samples_per_second": 6019.58, "eval_steps_per_second": 1.962, "step": 26439 }, { "epoch": 21.05, "learning_rate": 1.957903097696585e-05, "loss": 4.6003, "step": 26500 }, { "epoch": 21.45, "learning_rate": 1.9571088165210487e-05, "loss": 4.5839, "step": 27000 }, { "epoch": 21.84, "learning_rate": 1.9563145353455125e-05, "loss": 4.5649, "step": 27500 }, { "epoch": 22.0, "eval_loss": 4.363780498504639, "eval_runtime": 67.376, "eval_samples_per_second": 6375.047, "eval_steps_per_second": 2.078, "step": 27698 }, { "epoch": 22.24, "learning_rate": 1.9555202541699763e-05, "loss": 4.5453, "step": 28000 }, { "epoch": 22.64, "learning_rate": 1.95472597299444e-05, "loss": 4.5306, "step": 28500 }, { "epoch": 23.0, "eval_loss": 4.31637716293335, "eval_runtime": 69.8009, "eval_samples_per_second": 6153.576, "eval_steps_per_second": 2.006, "step": 28957 }, { "epoch": 23.03, "learning_rate": 1.953931691818904e-05, "loss": 4.5124, "step": 29000 }, { "epoch": 23.43, "learning_rate": 1.9531374106433677e-05, "loss": 4.4941, "step": 29500 }, { "epoch": 23.83, "learning_rate": 1.9523431294678315e-05, "loss": 4.4793, "step": 30000 }, { "epoch": 24.0, "eval_loss": 4.280160427093506, "eval_runtime": 66.8826, "eval_samples_per_second": 6422.07, "eval_steps_per_second": 2.093, "step": 30216 }, { "epoch": 24.23, "learning_rate": 1.9515488482922957e-05, "loss": 4.4636, "step": 30500 }, { "epoch": 24.62, "learning_rate": 1.9507545671167595e-05, "loss": 4.4485, "step": 31000 }, { "epoch": 25.0, "eval_loss": 4.23799467086792, "eval_runtime": 69.4349, "eval_samples_per_second": 6186.012, "eval_steps_per_second": 2.016, "step": 31475 }, { "epoch": 25.02, "learning_rate": 1.9499602859412233e-05, "loss": 4.4339, "step": 31500 }, { "epoch": 25.42, "learning_rate": 1.9491660047656875e-05, "loss": 4.4187, "step": 32000 }, { "epoch": 25.81, "learning_rate": 1.9483717235901513e-05, "loss": 4.4025, "step": 32500 }, { "epoch": 26.0, "eval_loss": 4.209211826324463, "eval_runtime": 67.4361, "eval_samples_per_second": 6369.361, "eval_steps_per_second": 2.076, "step": 32734 }, { "epoch": 26.21, "learning_rate": 1.947577442414615e-05, "loss": 4.3873, "step": 33000 }, { "epoch": 26.61, "learning_rate": 1.946783161239079e-05, "loss": 4.3753, "step": 33500 }, { "epoch": 27.0, "eval_loss": 4.171086311340332, "eval_runtime": 66.8329, "eval_samples_per_second": 6426.851, "eval_steps_per_second": 2.095, "step": 33993 }, { "epoch": 27.01, "learning_rate": 1.9459888800635427e-05, "loss": 4.3613, "step": 34000 }, { "epoch": 27.4, "learning_rate": 1.9451945988880065e-05, "loss": 4.3455, "step": 34500 }, { "epoch": 27.8, "learning_rate": 1.9444003177124703e-05, "loss": 4.3334, "step": 35000 }, { "epoch": 28.0, "eval_loss": 4.136817932128906, "eval_runtime": 67.501, "eval_samples_per_second": 6363.243, "eval_steps_per_second": 2.074, "step": 35252 }, { "epoch": 28.2, "learning_rate": 1.943606036536934e-05, "loss": 4.321, "step": 35500 }, { "epoch": 28.59, "learning_rate": 1.942811755361398e-05, "loss": 4.3075, "step": 36000 }, { "epoch": 28.99, "learning_rate": 1.942017474185862e-05, "loss": 4.2973, "step": 36500 }, { "epoch": 29.0, "eval_loss": 4.108701229095459, "eval_runtime": 67.3525, "eval_samples_per_second": 6377.264, "eval_steps_per_second": 2.079, "step": 36511 }, { "epoch": 29.39, "learning_rate": 1.941223193010326e-05, "loss": 4.2808, "step": 37000 }, { "epoch": 29.79, "learning_rate": 1.9404289118347897e-05, "loss": 4.2712, "step": 37500 }, { "epoch": 30.0, "eval_loss": 4.082311630249023, "eval_runtime": 67.0067, "eval_samples_per_second": 6410.184, "eval_steps_per_second": 2.089, "step": 37770 }, { "epoch": 30.18, "learning_rate": 1.9396346306592535e-05, "loss": 4.2596, "step": 38000 }, { "epoch": 30.58, "learning_rate": 1.9388403494837173e-05, "loss": 4.2507, "step": 38500 }, { "epoch": 30.98, "learning_rate": 1.938046068308181e-05, "loss": 4.2366, "step": 39000 }, { "epoch": 31.0, "eval_loss": 4.0523858070373535, "eval_runtime": 65.7939, "eval_samples_per_second": 6528.345, "eval_steps_per_second": 2.128, "step": 39029 }, { "epoch": 31.37, "learning_rate": 1.937251787132645e-05, "loss": 4.2237, "step": 39500 }, { "epoch": 31.77, "learning_rate": 1.9364575059571088e-05, "loss": 4.2161, "step": 40000 }, { "epoch": 32.0, "eval_loss": 4.022578239440918, "eval_runtime": 66.3694, "eval_samples_per_second": 6471.732, "eval_steps_per_second": 2.109, "step": 40288 }, { "epoch": 32.17, "learning_rate": 1.935663224781573e-05, "loss": 4.2023, "step": 40500 }, { "epoch": 32.57, "learning_rate": 1.9348689436060367e-05, "loss": 4.1933, "step": 41000 }, { "epoch": 32.96, "learning_rate": 1.9340746624305005e-05, "loss": 4.1827, "step": 41500 }, { "epoch": 33.0, "eval_loss": 4.000241279602051, "eval_runtime": 67.2972, "eval_samples_per_second": 6382.51, "eval_steps_per_second": 2.08, "step": 41547 }, { "epoch": 33.36, "learning_rate": 1.9332803812549644e-05, "loss": 4.1727, "step": 42000 }, { "epoch": 33.76, "learning_rate": 1.9324861000794285e-05, "loss": 4.1611, "step": 42500 }, { "epoch": 34.0, "eval_loss": 3.9723849296569824, "eval_runtime": 67.3926, "eval_samples_per_second": 6373.47, "eval_steps_per_second": 2.077, "step": 42806 }, { "epoch": 34.15, "learning_rate": 1.9316918189038923e-05, "loss": 4.1554, "step": 43000 }, { "epoch": 34.55, "learning_rate": 1.930897537728356e-05, "loss": 4.1419, "step": 43500 }, { "epoch": 34.95, "learning_rate": 1.93010325655282e-05, "loss": 4.135, "step": 44000 }, { "epoch": 35.0, "eval_loss": 3.9536895751953125, "eval_runtime": 67.1765, "eval_samples_per_second": 6393.973, "eval_steps_per_second": 2.084, "step": 44065 }, { "epoch": 35.35, "learning_rate": 1.9293089753772837e-05, "loss": 4.1238, "step": 44500 }, { "epoch": 35.74, "learning_rate": 1.9285146942017476e-05, "loss": 4.1139, "step": 45000 }, { "epoch": 36.0, "eval_loss": 3.9246768951416016, "eval_runtime": 67.7772, "eval_samples_per_second": 6337.307, "eval_steps_per_second": 2.066, "step": 45324 }, { "epoch": 36.14, "learning_rate": 1.9277204130262114e-05, "loss": 4.1083, "step": 45500 }, { "epoch": 36.54, "learning_rate": 1.9269261318506752e-05, "loss": 4.0977, "step": 46000 }, { "epoch": 36.93, "learning_rate": 1.9261318506751393e-05, "loss": 4.0906, "step": 46500 }, { "epoch": 37.0, "eval_loss": 3.9067018032073975, "eval_runtime": 67.3722, "eval_samples_per_second": 6375.408, "eval_steps_per_second": 2.078, "step": 46583 }, { "epoch": 37.33, "learning_rate": 1.925337569499603e-05, "loss": 4.082, "step": 47000 }, { "epoch": 37.73, "learning_rate": 1.924543288324067e-05, "loss": 4.0735, "step": 47500 }, { "epoch": 38.0, "eval_loss": 3.885633945465088, "eval_runtime": 67.1041, "eval_samples_per_second": 6400.875, "eval_steps_per_second": 2.086, "step": 47842 }, { "epoch": 38.13, "learning_rate": 1.9237490071485308e-05, "loss": 4.0647, "step": 48000 }, { "epoch": 38.52, "learning_rate": 1.9229547259729946e-05, "loss": 4.0549, "step": 48500 }, { "epoch": 38.92, "learning_rate": 1.9221604447974584e-05, "loss": 4.0479, "step": 49000 }, { "epoch": 39.0, "eval_loss": 3.8615520000457764, "eval_runtime": 68.4352, "eval_samples_per_second": 6276.377, "eval_steps_per_second": 2.046, "step": 49101 }, { "epoch": 39.32, "learning_rate": 1.9213661636219222e-05, "loss": 4.0404, "step": 49500 }, { "epoch": 39.71, "learning_rate": 1.920571882446386e-05, "loss": 4.0308, "step": 50000 }, { "epoch": 40.0, "eval_loss": 3.8436472415924072, "eval_runtime": 66.2606, "eval_samples_per_second": 6482.356, "eval_steps_per_second": 2.113, "step": 50360 }, { "epoch": 40.11, "learning_rate": 1.91977760127085e-05, "loss": 4.0236, "step": 50500 }, { "epoch": 40.51, "learning_rate": 1.918983320095314e-05, "loss": 4.0152, "step": 51000 }, { "epoch": 40.91, "learning_rate": 1.9181890389197778e-05, "loss": 4.0092, "step": 51500 }, { "epoch": 41.0, "eval_loss": 3.8258142471313477, "eval_runtime": 66.2712, "eval_samples_per_second": 6481.319, "eval_steps_per_second": 2.113, "step": 51619 }, { "epoch": 41.3, "learning_rate": 1.9173947577442416e-05, "loss": 3.9995, "step": 52000 }, { "epoch": 41.7, "learning_rate": 1.9166004765687054e-05, "loss": 3.9931, "step": 52500 }, { "epoch": 42.0, "eval_loss": 3.807096242904663, "eval_runtime": 67.409, "eval_samples_per_second": 6371.923, "eval_steps_per_second": 2.077, "step": 52878 }, { "epoch": 42.1, "learning_rate": 1.9158061953931692e-05, "loss": 3.9844, "step": 53000 }, { "epoch": 42.49, "learning_rate": 1.915011914217633e-05, "loss": 3.9766, "step": 53500 }, { "epoch": 42.89, "learning_rate": 1.914217633042097e-05, "loss": 3.9698, "step": 54000 }, { "epoch": 43.0, "eval_loss": 3.7926580905914307, "eval_runtime": 65.7675, "eval_samples_per_second": 6530.958, "eval_steps_per_second": 2.129, "step": 54137 }, { "epoch": 43.29, "learning_rate": 1.913423351866561e-05, "loss": 3.9651, "step": 54500 }, { "epoch": 43.69, "learning_rate": 1.9126290706910248e-05, "loss": 3.9553, "step": 55000 }, { "epoch": 44.0, "eval_loss": 3.772336483001709, "eval_runtime": 68.2823, "eval_samples_per_second": 6290.431, "eval_steps_per_second": 2.05, "step": 55396 }, { "epoch": 44.08, "learning_rate": 1.9118347895154886e-05, "loss": 3.9494, "step": 55500 }, { "epoch": 44.48, "learning_rate": 1.9110405083399524e-05, "loss": 3.9438, "step": 56000 }, { "epoch": 44.88, "learning_rate": 1.9102462271644166e-05, "loss": 3.9386, "step": 56500 }, { "epoch": 45.0, "eval_loss": 3.760333776473999, "eval_runtime": 66.084, "eval_samples_per_second": 6499.685, "eval_steps_per_second": 2.119, "step": 56655 }, { "epoch": 45.27, "learning_rate": 1.9094519459888804e-05, "loss": 3.9294, "step": 57000 }, { "epoch": 45.67, "learning_rate": 1.9086576648133442e-05, "loss": 3.9237, "step": 57500 }, { "epoch": 46.0, "eval_loss": 3.74114990234375, "eval_runtime": 66.5778, "eval_samples_per_second": 6451.476, "eval_steps_per_second": 2.103, "step": 57914 }, { "epoch": 46.07, "learning_rate": 1.907863383637808e-05, "loss": 3.919, "step": 58000 }, { "epoch": 46.47, "learning_rate": 1.9070691024622718e-05, "loss": 3.9105, "step": 58500 }, { "epoch": 46.86, "learning_rate": 1.9062748212867356e-05, "loss": 3.9029, "step": 59000 }, { "epoch": 47.0, "eval_loss": 3.7245988845825195, "eval_runtime": 66.6555, "eval_samples_per_second": 6443.956, "eval_steps_per_second": 2.1, "step": 59173 }, { "epoch": 47.26, "learning_rate": 1.9054805401111994e-05, "loss": 3.8969, "step": 59500 }, { "epoch": 47.66, "learning_rate": 1.9046862589356632e-05, "loss": 3.8921, "step": 60000 }, { "epoch": 48.0, "eval_loss": 3.709540843963623, "eval_runtime": 68.5212, "eval_samples_per_second": 6268.501, "eval_steps_per_second": 2.043, "step": 60432 }, { "epoch": 48.05, "learning_rate": 1.9038919777601274e-05, "loss": 3.8875, "step": 60500 }, { "epoch": 48.45, "learning_rate": 1.9030976965845912e-05, "loss": 3.88, "step": 61000 }, { "epoch": 48.85, "learning_rate": 1.902303415409055e-05, "loss": 3.8755, "step": 61500 }, { "epoch": 49.0, "eval_loss": 3.692728281021118, "eval_runtime": 68.9938, "eval_samples_per_second": 6225.561, "eval_steps_per_second": 2.029, "step": 61691 }, { "epoch": 49.25, "learning_rate": 1.9015091342335188e-05, "loss": 3.8679, "step": 62000 }, { "epoch": 49.64, "learning_rate": 1.9007148530579826e-05, "loss": 3.863, "step": 62500 }, { "epoch": 50.0, "eval_loss": 3.687544345855713, "eval_runtime": 67.1467, "eval_samples_per_second": 6396.812, "eval_steps_per_second": 2.085, "step": 62950 }, { "epoch": 50.04, "learning_rate": 1.8999205718824465e-05, "loss": 3.857, "step": 63000 }, { "epoch": 50.44, "learning_rate": 1.8991262907069103e-05, "loss": 3.8495, "step": 63500 }, { "epoch": 50.83, "learning_rate": 1.898332009531374e-05, "loss": 3.8465, "step": 64000 }, { "epoch": 51.0, "eval_loss": 3.668276071548462, "eval_runtime": 66.8777, "eval_samples_per_second": 6422.543, "eval_steps_per_second": 2.093, "step": 64209 }, { "epoch": 51.23, "learning_rate": 1.897537728355838e-05, "loss": 3.8387, "step": 64500 }, { "epoch": 51.63, "learning_rate": 1.896743447180302e-05, "loss": 3.8317, "step": 65000 }, { "epoch": 52.0, "eval_loss": 3.653848171234131, "eval_runtime": 66.6198, "eval_samples_per_second": 6447.404, "eval_steps_per_second": 2.101, "step": 65468 }, { "epoch": 52.03, "learning_rate": 1.895949166004766e-05, "loss": 3.8305, "step": 65500 }, { "epoch": 52.42, "learning_rate": 1.8951548848292297e-05, "loss": 3.8253, "step": 66000 }, { "epoch": 52.82, "learning_rate": 1.8943606036536935e-05, "loss": 3.8187, "step": 66500 }, { "epoch": 53.0, "eval_loss": 3.6389107704162598, "eval_runtime": 66.3452, "eval_samples_per_second": 6474.092, "eval_steps_per_second": 2.11, "step": 66727 }, { "epoch": 53.22, "learning_rate": 1.8935663224781576e-05, "loss": 3.8167, "step": 67000 }, { "epoch": 53.61, "learning_rate": 1.8927720413026214e-05, "loss": 3.8085, "step": 67500 }, { "epoch": 54.0, "eval_loss": 3.6302101612091064, "eval_runtime": 66.6228, "eval_samples_per_second": 6447.12, "eval_steps_per_second": 2.101, "step": 67986 }, { "epoch": 54.01, "learning_rate": 1.8919777601270852e-05, "loss": 3.8058, "step": 68000 }, { "epoch": 54.41, "learning_rate": 1.891183478951549e-05, "loss": 3.7988, "step": 68500 }, { "epoch": 54.81, "learning_rate": 1.890389197776013e-05, "loss": 3.7938, "step": 69000 }, { "epoch": 55.0, "eval_loss": 3.6164495944976807, "eval_runtime": 67.1953, "eval_samples_per_second": 6392.185, "eval_steps_per_second": 2.083, "step": 69245 }, { "epoch": 55.2, "learning_rate": 1.8895949166004767e-05, "loss": 3.7869, "step": 69500 }, { "epoch": 55.6, "learning_rate": 1.8888006354249405e-05, "loss": 3.784, "step": 70000 }, { "epoch": 56.0, "learning_rate": 1.8880063542494046e-05, "loss": 3.7802, "step": 70500 }, { "epoch": 56.0, "eval_loss": 3.6089677810668945, "eval_runtime": 67.3655, "eval_samples_per_second": 6376.034, "eval_steps_per_second": 2.078, "step": 70504 }, { "epoch": 56.39, "learning_rate": 1.8872120730738684e-05, "loss": 3.7744, "step": 71000 }, { "epoch": 56.79, "learning_rate": 1.8864177918983323e-05, "loss": 3.7681, "step": 71500 }, { "epoch": 57.0, "eval_loss": 3.594083547592163, "eval_runtime": 68.8765, "eval_samples_per_second": 6236.163, "eval_steps_per_second": 2.033, "step": 71763 }, { "epoch": 57.19, "learning_rate": 1.885623510722796e-05, "loss": 3.7626, "step": 72000 }, { "epoch": 57.59, "learning_rate": 1.88482922954726e-05, "loss": 3.7606, "step": 72500 }, { "epoch": 57.98, "learning_rate": 1.8840349483717237e-05, "loss": 3.7561, "step": 73000 }, { "epoch": 58.0, "eval_loss": 3.5827672481536865, "eval_runtime": 67.9355, "eval_samples_per_second": 6322.538, "eval_steps_per_second": 2.061, "step": 73022 }, { "epoch": 58.38, "learning_rate": 1.8832406671961875e-05, "loss": 3.7499, "step": 73500 }, { "epoch": 58.78, "learning_rate": 1.8824463860206513e-05, "loss": 3.7484, "step": 74000 }, { "epoch": 59.0, "eval_loss": 3.573406934738159, "eval_runtime": 67.2384, "eval_samples_per_second": 6388.088, "eval_steps_per_second": 2.082, "step": 74281 }, { "epoch": 59.17, "learning_rate": 1.881652104845115e-05, "loss": 3.7422, "step": 74500 }, { "epoch": 59.57, "learning_rate": 1.8808578236695793e-05, "loss": 3.7379, "step": 75000 }, { "epoch": 59.97, "learning_rate": 1.880063542494043e-05, "loss": 3.7324, "step": 75500 }, { "epoch": 60.0, "eval_loss": 3.5540149211883545, "eval_runtime": 69.3846, "eval_samples_per_second": 6190.499, "eval_steps_per_second": 2.018, "step": 75540 }, { "epoch": 60.37, "learning_rate": 1.879269261318507e-05, "loss": 3.7268, "step": 76000 }, { "epoch": 60.76, "learning_rate": 1.8784749801429707e-05, "loss": 3.7267, "step": 76500 }, { "epoch": 61.0, "eval_loss": 3.553678274154663, "eval_runtime": 67.899, "eval_samples_per_second": 6325.937, "eval_steps_per_second": 2.062, "step": 76799 }, { "epoch": 61.16, "learning_rate": 1.8776806989674345e-05, "loss": 3.7195, "step": 77000 }, { "epoch": 61.56, "learning_rate": 1.8768864177918983e-05, "loss": 3.7172, "step": 77500 }, { "epoch": 61.95, "learning_rate": 1.876092136616362e-05, "loss": 3.7124, "step": 78000 }, { "epoch": 62.0, "eval_loss": 3.5368738174438477, "eval_runtime": 66.6148, "eval_samples_per_second": 6447.895, "eval_steps_per_second": 2.102, "step": 78058 }, { "epoch": 62.35, "learning_rate": 1.875297855440826e-05, "loss": 3.7068, "step": 78500 }, { "epoch": 62.75, "learning_rate": 1.87450357426529e-05, "loss": 3.7038, "step": 79000 }, { "epoch": 63.0, "eval_loss": 3.527386426925659, "eval_runtime": 66.2228, "eval_samples_per_second": 6486.06, "eval_steps_per_second": 2.114, "step": 79317 }, { "epoch": 63.15, "learning_rate": 1.873709293089754e-05, "loss": 3.6984, "step": 79500 }, { "epoch": 63.54, "learning_rate": 1.8729150119142177e-05, "loss": 3.6969, "step": 80000 }, { "epoch": 63.94, "learning_rate": 1.872120730738682e-05, "loss": 3.6908, "step": 80500 }, { "epoch": 64.0, "eval_loss": 3.5240414142608643, "eval_runtime": 68.4559, "eval_samples_per_second": 6274.479, "eval_steps_per_second": 2.045, "step": 80576 }, { "epoch": 64.34, "learning_rate": 1.8713264495631457e-05, "loss": 3.6889, "step": 81000 }, { "epoch": 64.73, "learning_rate": 1.8705321683876095e-05, "loss": 3.6819, "step": 81500 }, { "epoch": 65.0, "eval_loss": 3.5158050060272217, "eval_runtime": 67.0353, "eval_samples_per_second": 6407.448, "eval_steps_per_second": 2.088, "step": 81835 }, { "epoch": 65.13, "learning_rate": 1.8697378872120733e-05, "loss": 3.6795, "step": 82000 }, { "epoch": 65.53, "learning_rate": 1.868943606036537e-05, "loss": 3.6755, "step": 82500 }, { "epoch": 65.93, "learning_rate": 1.868149324861001e-05, "loss": 3.6729, "step": 83000 }, { "epoch": 66.0, "eval_loss": 3.5025932788848877, "eval_runtime": 68.4788, "eval_samples_per_second": 6272.38, "eval_steps_per_second": 2.044, "step": 83094 }, { "epoch": 66.32, "learning_rate": 1.8673550436854647e-05, "loss": 3.6646, "step": 83500 }, { "epoch": 66.72, "learning_rate": 1.8665607625099285e-05, "loss": 3.6644, "step": 84000 }, { "epoch": 67.0, "eval_loss": 3.4984912872314453, "eval_runtime": 67.9437, "eval_samples_per_second": 6321.778, "eval_steps_per_second": 2.061, "step": 84353 }, { "epoch": 67.12, "learning_rate": 1.8657664813343927e-05, "loss": 3.6597, "step": 84500 }, { "epoch": 67.51, "learning_rate": 1.8649722001588565e-05, "loss": 3.6562, "step": 85000 }, { "epoch": 67.91, "learning_rate": 1.8641779189833203e-05, "loss": 3.6521, "step": 85500 }, { "epoch": 68.0, "eval_loss": 3.4852287769317627, "eval_runtime": 66.4225, "eval_samples_per_second": 6466.557, "eval_steps_per_second": 2.108, "step": 85612 }, { "epoch": 68.31, "learning_rate": 1.863383637807784e-05, "loss": 3.6498, "step": 86000 }, { "epoch": 68.71, "learning_rate": 1.862589356632248e-05, "loss": 3.645, "step": 86500 }, { "epoch": 69.0, "eval_loss": 3.47290301322937, "eval_runtime": 66.8801, "eval_samples_per_second": 6422.318, "eval_steps_per_second": 2.093, "step": 86871 }, { "epoch": 69.1, "learning_rate": 1.8617950754567118e-05, "loss": 3.6473, "step": 87000 }, { "epoch": 69.5, "learning_rate": 1.8610007942811756e-05, "loss": 3.6397, "step": 87500 }, { "epoch": 69.9, "learning_rate": 1.8602065131056394e-05, "loss": 3.6368, "step": 88000 }, { "epoch": 70.0, "eval_loss": 3.466801881790161, "eval_runtime": 67.0136, "eval_samples_per_second": 6409.524, "eval_steps_per_second": 2.089, "step": 88130 }, { "epoch": 70.29, "learning_rate": 1.8594122319301032e-05, "loss": 3.6326, "step": 88500 }, { "epoch": 70.69, "learning_rate": 1.8586179507545673e-05, "loss": 3.6293, "step": 89000 }, { "epoch": 71.0, "eval_loss": 3.4586901664733887, "eval_runtime": 70.8439, "eval_samples_per_second": 6062.982, "eval_steps_per_second": 1.976, "step": 89389 }, { "epoch": 71.09, "learning_rate": 1.857823669579031e-05, "loss": 3.6278, "step": 89500 }, { "epoch": 71.49, "learning_rate": 1.857029388403495e-05, "loss": 3.6235, "step": 90000 }, { "epoch": 71.88, "learning_rate": 1.8562351072279588e-05, "loss": 3.6188, "step": 90500 }, { "epoch": 72.0, "eval_loss": 3.448523998260498, "eval_runtime": 67.5981, "eval_samples_per_second": 6354.094, "eval_steps_per_second": 2.071, "step": 90648 }, { "epoch": 72.28, "learning_rate": 1.855440826052423e-05, "loss": 3.6147, "step": 91000 }, { "epoch": 72.68, "learning_rate": 1.8546465448768867e-05, "loss": 3.6128, "step": 91500 }, { "epoch": 73.0, "eval_loss": 3.4464056491851807, "eval_runtime": 68.3905, "eval_samples_per_second": 6280.476, "eval_steps_per_second": 2.047, "step": 91907 }, { "epoch": 73.07, "learning_rate": 1.8538522637013505e-05, "loss": 3.6088, "step": 92000 }, { "epoch": 73.47, "learning_rate": 1.8530579825258144e-05, "loss": 3.6079, "step": 92500 }, { "epoch": 73.87, "learning_rate": 1.852263701350278e-05, "loss": 3.6036, "step": 93000 }, { "epoch": 74.0, "eval_loss": 3.434415817260742, "eval_runtime": 66.6353, "eval_samples_per_second": 6445.911, "eval_steps_per_second": 2.101, "step": 93166 }, { "epoch": 74.27, "learning_rate": 1.851469420174742e-05, "loss": 3.5981, "step": 93500 }, { "epoch": 74.66, "learning_rate": 1.8506751389992058e-05, "loss": 3.5946, "step": 94000 }, { "epoch": 75.0, "eval_loss": 3.4281129837036133, "eval_runtime": 69.3581, "eval_samples_per_second": 6192.858, "eval_steps_per_second": 2.019, "step": 94425 }, { "epoch": 75.06, "learning_rate": 1.84988085782367e-05, "loss": 3.5964, "step": 94500 }, { "epoch": 75.46, "learning_rate": 1.8490865766481337e-05, "loss": 3.5889, "step": 95000 }, { "epoch": 75.85, "learning_rate": 1.8482922954725976e-05, "loss": 3.5867, "step": 95500 }, { "epoch": 76.0, "eval_loss": 3.4131107330322266, "eval_runtime": 67.6941, "eval_samples_per_second": 6345.09, "eval_steps_per_second": 2.068, "step": 95684 }, { "epoch": 76.25, "learning_rate": 1.8474980142970614e-05, "loss": 3.5854, "step": 96000 }, { "epoch": 76.65, "learning_rate": 1.8467037331215252e-05, "loss": 3.5765, "step": 96500 }, { "epoch": 77.0, "eval_loss": 3.4167041778564453, "eval_runtime": 66.4948, "eval_samples_per_second": 6459.532, "eval_steps_per_second": 2.105, "step": 96943 }, { "epoch": 77.05, "learning_rate": 1.845909451945989e-05, "loss": 3.5785, "step": 97000 }, { "epoch": 77.44, "learning_rate": 1.8451151707704528e-05, "loss": 3.5731, "step": 97500 }, { "epoch": 77.84, "learning_rate": 1.8443208895949166e-05, "loss": 3.5715, "step": 98000 }, { "epoch": 78.0, "eval_loss": 3.4056899547576904, "eval_runtime": 66.6161, "eval_samples_per_second": 6447.77, "eval_steps_per_second": 2.102, "step": 98202 }, { "epoch": 78.24, "learning_rate": 1.8435266084193804e-05, "loss": 3.5657, "step": 98500 }, { "epoch": 78.63, "learning_rate": 1.8427323272438446e-05, "loss": 3.5628, "step": 99000 }, { "epoch": 79.0, "eval_loss": 3.398467779159546, "eval_runtime": 66.3825, "eval_samples_per_second": 6470.453, "eval_steps_per_second": 2.109, "step": 99461 }, { "epoch": 79.03, "learning_rate": 1.8419380460683084e-05, "loss": 3.5621, "step": 99500 }, { "epoch": 79.43, "learning_rate": 1.8411437648927722e-05, "loss": 3.5593, "step": 100000 }, { "epoch": 79.83, "learning_rate": 1.840349483717236e-05, "loss": 3.5563, "step": 100500 }, { "epoch": 80.0, "eval_loss": 3.389631748199463, "eval_runtime": 66.7852, "eval_samples_per_second": 6431.444, "eval_steps_per_second": 2.096, "step": 100720 }, { "epoch": 80.22, "learning_rate": 1.8395552025416998e-05, "loss": 3.5534, "step": 101000 }, { "epoch": 80.62, "learning_rate": 1.8387609213661636e-05, "loss": 3.5528, "step": 101500 }, { "epoch": 81.0, "eval_loss": 3.383220672607422, "eval_runtime": 66.6428, "eval_samples_per_second": 6445.181, "eval_steps_per_second": 2.101, "step": 101979 }, { "epoch": 81.02, "learning_rate": 1.8379666401906274e-05, "loss": 3.5483, "step": 102000 }, { "epoch": 81.41, "learning_rate": 1.8371723590150913e-05, "loss": 3.5455, "step": 102500 }, { "epoch": 81.81, "learning_rate": 1.8363780778395554e-05, "loss": 3.5418, "step": 103000 }, { "epoch": 82.0, "eval_loss": 3.3751513957977295, "eval_runtime": 67.3609, "eval_samples_per_second": 6376.469, "eval_steps_per_second": 2.078, "step": 103238 }, { "epoch": 82.21, "learning_rate": 1.8355837966640192e-05, "loss": 3.5402, "step": 103500 }, { "epoch": 82.61, "learning_rate": 1.834789515488483e-05, "loss": 3.5387, "step": 104000 }, { "epoch": 83.0, "eval_loss": 3.369637966156006, "eval_runtime": 67.708, "eval_samples_per_second": 6343.782, "eval_steps_per_second": 2.068, "step": 104497 }, { "epoch": 83.0, "learning_rate": 1.8339952343129472e-05, "loss": 3.5316, "step": 104500 }, { "epoch": 83.4, "learning_rate": 1.833200953137411e-05, "loss": 3.5324, "step": 105000 }, { "epoch": 83.8, "learning_rate": 1.8324066719618748e-05, "loss": 3.5288, "step": 105500 }, { "epoch": 84.0, "eval_loss": 3.3638176918029785, "eval_runtime": 66.3944, "eval_samples_per_second": 6469.3, "eval_steps_per_second": 2.109, "step": 105756 }, { "epoch": 84.19, "learning_rate": 1.8316123907863386e-05, "loss": 3.5262, "step": 106000 }, { "epoch": 84.59, "learning_rate": 1.8308181096108024e-05, "loss": 3.5254, "step": 106500 }, { "epoch": 84.99, "learning_rate": 1.8300238284352662e-05, "loss": 3.5219, "step": 107000 }, { "epoch": 85.0, "eval_loss": 3.3570709228515625, "eval_runtime": 68.3924, "eval_samples_per_second": 6280.3, "eval_steps_per_second": 2.047, "step": 107015 }, { "epoch": 85.39, "learning_rate": 1.82922954725973e-05, "loss": 3.5185, "step": 107500 }, { "epoch": 85.78, "learning_rate": 1.828435266084194e-05, "loss": 3.5167, "step": 108000 }, { "epoch": 86.0, "eval_loss": 3.354255199432373, "eval_runtime": 67.3395, "eval_samples_per_second": 6378.504, "eval_steps_per_second": 2.079, "step": 108274 }, { "epoch": 86.18, "learning_rate": 1.8276409849086577e-05, "loss": 3.5139, "step": 108500 }, { "epoch": 86.58, "learning_rate": 1.8268467037331218e-05, "loss": 3.5113, "step": 109000 }, { "epoch": 86.97, "learning_rate": 1.8260524225575856e-05, "loss": 3.5094, "step": 109500 }, { "epoch": 87.0, "eval_loss": 3.3403420448303223, "eval_runtime": 68.5464, "eval_samples_per_second": 6266.194, "eval_steps_per_second": 2.042, "step": 109533 }, { "epoch": 87.37, "learning_rate": 1.8252581413820494e-05, "loss": 3.5045, "step": 110000 }, { "epoch": 87.77, "learning_rate": 1.8244638602065132e-05, "loss": 3.5044, "step": 110500 }, { "epoch": 88.0, "eval_loss": 3.3406448364257812, "eval_runtime": 66.8167, "eval_samples_per_second": 6428.404, "eval_steps_per_second": 2.095, "step": 110792 }, { "epoch": 88.17, "learning_rate": 1.823669579030977e-05, "loss": 3.5018, "step": 111000 }, { "epoch": 88.56, "learning_rate": 1.822875297855441e-05, "loss": 3.4962, "step": 111500 }, { "epoch": 88.96, "learning_rate": 1.8220810166799047e-05, "loss": 3.4978, "step": 112000 }, { "epoch": 89.0, "eval_loss": 3.3344266414642334, "eval_runtime": 66.7414, "eval_samples_per_second": 6435.656, "eval_steps_per_second": 2.098, "step": 112051 }, { "epoch": 89.36, "learning_rate": 1.8212867355043685e-05, "loss": 3.4945, "step": 112500 }, { "epoch": 89.75, "learning_rate": 1.8204924543288326e-05, "loss": 3.491, "step": 113000 }, { "epoch": 90.0, "eval_loss": 3.3249945640563965, "eval_runtime": 67.2197, "eval_samples_per_second": 6389.867, "eval_steps_per_second": 2.083, "step": 113310 }, { "epoch": 90.15, "learning_rate": 1.8196981731532964e-05, "loss": 3.488, "step": 113500 }, { "epoch": 90.55, "learning_rate": 1.8189038919777603e-05, "loss": 3.4882, "step": 114000 }, { "epoch": 90.95, "learning_rate": 1.818109610802224e-05, "loss": 3.4826, "step": 114500 }, { "epoch": 91.0, "eval_loss": 3.3221962451934814, "eval_runtime": 67.3476, "eval_samples_per_second": 6377.736, "eval_steps_per_second": 2.079, "step": 114569 }, { "epoch": 91.34, "learning_rate": 1.817315329626688e-05, "loss": 3.4815, "step": 115000 }, { "epoch": 91.74, "learning_rate": 1.816521048451152e-05, "loss": 3.4794, "step": 115500 }, { "epoch": 92.0, "eval_loss": 3.315640926361084, "eval_runtime": 67.273, "eval_samples_per_second": 6384.805, "eval_steps_per_second": 2.081, "step": 115828 }, { "epoch": 92.14, "learning_rate": 1.815726767275616e-05, "loss": 3.478, "step": 116000 }, { "epoch": 92.53, "learning_rate": 1.8149324861000797e-05, "loss": 3.4725, "step": 116500 }, { "epoch": 92.93, "learning_rate": 1.8141382049245435e-05, "loss": 3.4719, "step": 117000 }, { "epoch": 93.0, "eval_loss": 3.3098437786102295, "eval_runtime": 66.4664, "eval_samples_per_second": 6462.285, "eval_steps_per_second": 2.106, "step": 117087 }, { "epoch": 93.33, "learning_rate": 1.8133439237490073e-05, "loss": 3.4709, "step": 117500 }, { "epoch": 93.73, "learning_rate": 1.812549642573471e-05, "loss": 3.4677, "step": 118000 }, { "epoch": 94.0, "eval_loss": 3.3038644790649414, "eval_runtime": 67.2697, "eval_samples_per_second": 6385.119, "eval_steps_per_second": 2.081, "step": 118346 }, { "epoch": 94.12, "learning_rate": 1.811755361397935e-05, "loss": 3.4649, "step": 118500 }, { "epoch": 94.52, "learning_rate": 1.810961080222399e-05, "loss": 3.465, "step": 119000 }, { "epoch": 94.92, "learning_rate": 1.810166799046863e-05, "loss": 3.4601, "step": 119500 }, { "epoch": 95.0, "eval_loss": 3.302288293838501, "eval_runtime": 67.3643, "eval_samples_per_second": 6376.151, "eval_steps_per_second": 2.078, "step": 119605 }, { "epoch": 95.31, "learning_rate": 1.8093725178713267e-05, "loss": 3.461, "step": 120000 }, { "epoch": 95.71, "learning_rate": 1.8085782366957905e-05, "loss": 3.4582, "step": 120500 }, { "epoch": 96.0, "eval_loss": 3.2969419956207275, "eval_runtime": 66.6251, "eval_samples_per_second": 6446.899, "eval_steps_per_second": 2.101, "step": 120864 }, { "epoch": 96.11, "learning_rate": 1.8077839555202543e-05, "loss": 3.4542, "step": 121000 }, { "epoch": 96.51, "learning_rate": 1.806989674344718e-05, "loss": 3.4533, "step": 121500 }, { "epoch": 96.9, "learning_rate": 1.806195393169182e-05, "loss": 3.4519, "step": 122000 }, { "epoch": 97.0, "eval_loss": 3.284411907196045, "eval_runtime": 66.9869, "eval_samples_per_second": 6412.079, "eval_steps_per_second": 2.09, "step": 122123 }, { "epoch": 97.3, "learning_rate": 1.8054011119936457e-05, "loss": 3.448, "step": 122500 }, { "epoch": 97.7, "learning_rate": 1.80460683081811e-05, "loss": 3.4459, "step": 123000 }, { "epoch": 98.0, "eval_loss": 3.285104274749756, "eval_runtime": 71.6433, "eval_samples_per_second": 5995.324, "eval_steps_per_second": 1.954, "step": 123382 }, { "epoch": 98.09, "learning_rate": 1.8038125496425737e-05, "loss": 3.4414, "step": 123500 }, { "epoch": 98.49, "learning_rate": 1.8030182684670375e-05, "loss": 3.4427, "step": 124000 }, { "epoch": 98.89, "learning_rate": 1.8022239872915013e-05, "loss": 3.4413, "step": 124500 }, { "epoch": 99.0, "eval_loss": 3.278201103210449, "eval_runtime": 65.6711, "eval_samples_per_second": 6540.548, "eval_steps_per_second": 2.132, "step": 124641 }, { "epoch": 99.29, "learning_rate": 1.801429706115965e-05, "loss": 3.4343, "step": 125000 }, { "epoch": 99.68, "learning_rate": 1.800635424940429e-05, "loss": 3.4345, "step": 125500 }, { "epoch": 100.0, "eval_loss": 3.270134449005127, "eval_runtime": 69.1322, "eval_samples_per_second": 6213.098, "eval_steps_per_second": 2.025, "step": 125900 }, { "epoch": 100.08, "learning_rate": 1.7998411437648927e-05, "loss": 3.433, "step": 126000 }, { "epoch": 100.48, "learning_rate": 1.7990468625893566e-05, "loss": 3.4311, "step": 126500 }, { "epoch": 100.87, "learning_rate": 1.7982525814138204e-05, "loss": 3.4259, "step": 127000 }, { "epoch": 101.0, "eval_loss": 3.2695064544677734, "eval_runtime": 66.2868, "eval_samples_per_second": 6479.794, "eval_steps_per_second": 2.112, "step": 127159 }, { "epoch": 101.27, "learning_rate": 1.7974583002382845e-05, "loss": 3.4293, "step": 127500 }, { "epoch": 101.67, "learning_rate": 1.7966640190627483e-05, "loss": 3.4243, "step": 128000 }, { "epoch": 102.0, "eval_loss": 3.2620866298675537, "eval_runtime": 66.6706, "eval_samples_per_second": 6442.493, "eval_steps_per_second": 2.1, "step": 128418 }, { "epoch": 102.07, "learning_rate": 1.7958697378872125e-05, "loss": 3.4247, "step": 128500 }, { "epoch": 102.46, "learning_rate": 1.7950754567116763e-05, "loss": 3.4206, "step": 129000 }, { "epoch": 102.86, "learning_rate": 1.79428117553614e-05, "loss": 3.4214, "step": 129500 }, { "epoch": 103.0, "eval_loss": 3.2584633827209473, "eval_runtime": 66.6392, "eval_samples_per_second": 6445.53, "eval_steps_per_second": 2.101, "step": 129677 }, { "epoch": 103.26, "learning_rate": 1.793486894360604e-05, "loss": 3.419, "step": 130000 }, { "epoch": 103.65, "learning_rate": 1.7926926131850677e-05, "loss": 3.4173, "step": 130500 }, { "epoch": 104.0, "eval_loss": 3.2492363452911377, "eval_runtime": 65.8622, "eval_samples_per_second": 6521.574, "eval_steps_per_second": 2.126, "step": 130936 }, { "epoch": 104.05, "learning_rate": 1.7918983320095315e-05, "loss": 3.4124, "step": 131000 }, { "epoch": 104.45, "learning_rate": 1.7911040508339953e-05, "loss": 3.4123, "step": 131500 }, { "epoch": 104.85, "learning_rate": 1.790309769658459e-05, "loss": 3.4107, "step": 132000 }, { "epoch": 105.0, "eval_loss": 3.245388984680176, "eval_runtime": 67.5687, "eval_samples_per_second": 6356.866, "eval_steps_per_second": 2.072, "step": 132195 }, { "epoch": 105.24, "learning_rate": 1.789515488482923e-05, "loss": 3.4099, "step": 132500 }, { "epoch": 105.64, "learning_rate": 1.788721207307387e-05, "loss": 3.4038, "step": 133000 }, { "epoch": 106.0, "eval_loss": 3.246819257736206, "eval_runtime": 66.6017, "eval_samples_per_second": 6449.163, "eval_steps_per_second": 2.102, "step": 133454 }, { "epoch": 106.04, "learning_rate": 1.787926926131851e-05, "loss": 3.401, "step": 133500 }, { "epoch": 106.43, "learning_rate": 1.7871326449563147e-05, "loss": 3.4012, "step": 134000 }, { "epoch": 106.83, "learning_rate": 1.7863383637807785e-05, "loss": 3.4018, "step": 134500 }, { "epoch": 107.0, "eval_loss": 3.2361791133880615, "eval_runtime": 66.0106, "eval_samples_per_second": 6506.912, "eval_steps_per_second": 2.121, "step": 134713 }, { "epoch": 107.23, "learning_rate": 1.7855440826052424e-05, "loss": 3.3973, "step": 135000 }, { "epoch": 107.63, "learning_rate": 1.784749801429706e-05, "loss": 3.3978, "step": 135500 }, { "epoch": 108.0, "eval_loss": 3.2379322052001953, "eval_runtime": 67.2281, "eval_samples_per_second": 6389.073, "eval_steps_per_second": 2.082, "step": 135972 }, { "epoch": 108.02, "learning_rate": 1.78395552025417e-05, "loss": 3.3909, "step": 136000 }, { "epoch": 108.42, "learning_rate": 1.7831612390786338e-05, "loss": 3.3947, "step": 136500 }, { "epoch": 108.82, "learning_rate": 1.782366957903098e-05, "loss": 3.3918, "step": 137000 }, { "epoch": 109.0, "eval_loss": 3.2277774810791016, "eval_runtime": 67.382, "eval_samples_per_second": 6374.477, "eval_steps_per_second": 2.078, "step": 137231 }, { "epoch": 109.21, "learning_rate": 1.7815726767275617e-05, "loss": 3.3907, "step": 137500 }, { "epoch": 109.61, "learning_rate": 1.7807783955520256e-05, "loss": 3.3876, "step": 138000 }, { "epoch": 110.0, "eval_loss": 3.2269318103790283, "eval_runtime": 68.2498, "eval_samples_per_second": 6293.422, "eval_steps_per_second": 2.051, "step": 138490 }, { "epoch": 110.01, "learning_rate": 1.7799841143764894e-05, "loss": 3.3852, "step": 138500 }, { "epoch": 110.41, "learning_rate": 1.7791898332009532e-05, "loss": 3.3855, "step": 139000 }, { "epoch": 110.8, "learning_rate": 1.7783955520254173e-05, "loss": 3.3827, "step": 139500 }, { "epoch": 111.0, "eval_loss": 3.2248029708862305, "eval_runtime": 66.1624, "eval_samples_per_second": 6491.984, "eval_steps_per_second": 2.116, "step": 139749 }, { "epoch": 111.2, "learning_rate": 1.777601270849881e-05, "loss": 3.3767, "step": 140000 }, { "epoch": 111.6, "learning_rate": 1.776806989674345e-05, "loss": 3.3768, "step": 140500 }, { "epoch": 111.99, "learning_rate": 1.7760127084988088e-05, "loss": 3.376, "step": 141000 }, { "epoch": 112.0, "eval_loss": 3.221318006515503, "eval_runtime": 69.426, "eval_samples_per_second": 6186.804, "eval_steps_per_second": 2.017, "step": 141008 }, { "epoch": 112.39, "learning_rate": 1.7752184273232726e-05, "loss": 3.3763, "step": 141500 }, { "epoch": 112.79, "learning_rate": 1.7744241461477364e-05, "loss": 3.3715, "step": 142000 }, { "epoch": 113.0, "eval_loss": 3.2150943279266357, "eval_runtime": 65.8541, "eval_samples_per_second": 6522.369, "eval_steps_per_second": 2.126, "step": 142267 }, { "epoch": 113.19, "learning_rate": 1.7736298649722002e-05, "loss": 3.3728, "step": 142500 }, { "epoch": 113.58, "learning_rate": 1.7728355837966643e-05, "loss": 3.3699, "step": 143000 }, { "epoch": 113.98, "learning_rate": 1.772041302621128e-05, "loss": 3.3688, "step": 143500 }, { "epoch": 114.0, "eval_loss": 3.2068471908569336, "eval_runtime": 68.8423, "eval_samples_per_second": 6239.261, "eval_steps_per_second": 2.034, "step": 143526 }, { "epoch": 114.38, "learning_rate": 1.771247021445592e-05, "loss": 3.3638, "step": 144000 }, { "epoch": 114.77, "learning_rate": 1.7704527402700558e-05, "loss": 3.363, "step": 144500 }, { "epoch": 115.0, "eval_loss": 3.2005867958068848, "eval_runtime": 67.6833, "eval_samples_per_second": 6346.102, "eval_steps_per_second": 2.068, "step": 144785 }, { "epoch": 115.17, "learning_rate": 1.7696584590945196e-05, "loss": 3.3645, "step": 145000 }, { "epoch": 115.57, "learning_rate": 1.7688641779189834e-05, "loss": 3.3605, "step": 145500 }, { "epoch": 115.97, "learning_rate": 1.7680698967434472e-05, "loss": 3.359, "step": 146000 }, { "epoch": 116.0, "eval_loss": 3.1976523399353027, "eval_runtime": 67.6887, "eval_samples_per_second": 6345.592, "eval_steps_per_second": 2.068, "step": 146044 }, { "epoch": 116.36, "learning_rate": 1.767275615567911e-05, "loss": 3.3571, "step": 146500 }, { "epoch": 116.76, "learning_rate": 1.7664813343923752e-05, "loss": 3.3572, "step": 147000 }, { "epoch": 117.0, "eval_loss": 3.1955084800720215, "eval_runtime": 66.1692, "eval_samples_per_second": 6491.313, "eval_steps_per_second": 2.116, "step": 147303 }, { "epoch": 117.16, "learning_rate": 1.765687053216839e-05, "loss": 3.3548, "step": 147500 }, { "epoch": 117.55, "learning_rate": 1.7648927720413028e-05, "loss": 3.3521, "step": 148000 }, { "epoch": 117.95, "learning_rate": 1.7640984908657666e-05, "loss": 3.3502, "step": 148500 }, { "epoch": 118.0, "eval_loss": 3.1862761974334717, "eval_runtime": 70.2838, "eval_samples_per_second": 6111.292, "eval_steps_per_second": 1.992, "step": 148562 }, { "epoch": 118.35, "learning_rate": 1.7633042096902304e-05, "loss": 3.3509, "step": 149000 }, { "epoch": 118.75, "learning_rate": 1.7625099285146942e-05, "loss": 3.3487, "step": 149500 }, { "epoch": 119.0, "eval_loss": 3.18820858001709, "eval_runtime": 67.244, "eval_samples_per_second": 6387.554, "eval_steps_per_second": 2.082, "step": 149821 }, { "epoch": 119.14, "learning_rate": 1.761715647339158e-05, "loss": 3.3449, "step": 150000 }, { "epoch": 119.54, "learning_rate": 1.760921366163622e-05, "loss": 3.3438, "step": 150500 }, { "epoch": 119.94, "learning_rate": 1.7601270849880857e-05, "loss": 3.3427, "step": 151000 }, { "epoch": 120.0, "eval_loss": 3.184438705444336, "eval_runtime": 69.7407, "eval_samples_per_second": 6158.882, "eval_steps_per_second": 2.007, "step": 151080 }, { "epoch": 120.33, "learning_rate": 1.7593328038125498e-05, "loss": 3.3419, "step": 151500 }, { "epoch": 120.73, "learning_rate": 1.7585385226370136e-05, "loss": 3.3415, "step": 152000 }, { "epoch": 121.0, "eval_loss": 3.175325870513916, "eval_runtime": 68.4223, "eval_samples_per_second": 6277.558, "eval_steps_per_second": 2.046, "step": 152339 }, { "epoch": 121.13, "learning_rate": 1.7577442414614774e-05, "loss": 3.3393, "step": 152500 }, { "epoch": 121.53, "learning_rate": 1.7569499602859416e-05, "loss": 3.3352, "step": 153000 }, { "epoch": 121.92, "learning_rate": 1.7561556791104054e-05, "loss": 3.3368, "step": 153500 }, { "epoch": 122.0, "eval_loss": 3.179239511489868, "eval_runtime": 68.91, "eval_samples_per_second": 6233.135, "eval_steps_per_second": 2.032, "step": 153598 }, { "epoch": 122.32, "learning_rate": 1.7553613979348692e-05, "loss": 3.3303, "step": 154000 }, { "epoch": 122.72, "learning_rate": 1.754567116759333e-05, "loss": 3.3318, "step": 154500 }, { "epoch": 123.0, "eval_loss": 3.1702351570129395, "eval_runtime": 67.6144, "eval_samples_per_second": 6352.566, "eval_steps_per_second": 2.071, "step": 154857 }, { "epoch": 123.11, "learning_rate": 1.7537728355837968e-05, "loss": 3.3317, "step": 155000 }, { "epoch": 123.51, "learning_rate": 1.7529785544082606e-05, "loss": 3.3283, "step": 155500 }, { "epoch": 123.91, "learning_rate": 1.7521842732327245e-05, "loss": 3.3265, "step": 156000 }, { "epoch": 124.0, "eval_loss": 3.1713407039642334, "eval_runtime": 67.5392, "eval_samples_per_second": 6359.637, "eval_steps_per_second": 2.073, "step": 156116 }, { "epoch": 124.31, "learning_rate": 1.7513899920571883e-05, "loss": 3.3247, "step": 156500 }, { "epoch": 124.7, "learning_rate": 1.7505957108816524e-05, "loss": 3.3248, "step": 157000 }, { "epoch": 125.0, "eval_loss": 3.1668007373809814, "eval_runtime": 68.9736, "eval_samples_per_second": 6227.381, "eval_steps_per_second": 2.03, "step": 157375 }, { "epoch": 125.1, "learning_rate": 1.7498014297061162e-05, "loss": 3.3218, "step": 157500 }, { "epoch": 125.5, "learning_rate": 1.74900714853058e-05, "loss": 3.3227, "step": 158000 }, { "epoch": 125.89, "learning_rate": 1.748212867355044e-05, "loss": 3.3188, "step": 158500 }, { "epoch": 126.0, "eval_loss": 3.1598927974700928, "eval_runtime": 66.1629, "eval_samples_per_second": 6491.932, "eval_steps_per_second": 2.116, "step": 158634 }, { "epoch": 126.29, "learning_rate": 1.7474185861795077e-05, "loss": 3.3156, "step": 159000 }, { "epoch": 126.69, "learning_rate": 1.7466243050039715e-05, "loss": 3.3181, "step": 159500 }, { "epoch": 127.0, "eval_loss": 3.161029815673828, "eval_runtime": 65.6873, "eval_samples_per_second": 6538.932, "eval_steps_per_second": 2.131, "step": 159893 }, { "epoch": 127.08, "learning_rate": 1.7458300238284353e-05, "loss": 3.3151, "step": 160000 }, { "epoch": 127.48, "learning_rate": 1.745035742652899e-05, "loss": 3.3145, "step": 160500 }, { "epoch": 127.88, "learning_rate": 1.744241461477363e-05, "loss": 3.3124, "step": 161000 }, { "epoch": 128.0, "eval_loss": 3.152050495147705, "eval_runtime": 66.7887, "eval_samples_per_second": 6431.103, "eval_steps_per_second": 2.096, "step": 161152 }, { "epoch": 128.28, "learning_rate": 1.743447180301827e-05, "loss": 3.3151, "step": 161500 }, { "epoch": 128.67, "learning_rate": 1.742652899126291e-05, "loss": 3.3072, "step": 162000 }, { "epoch": 129.0, "eval_loss": 3.154831886291504, "eval_runtime": 66.511, "eval_samples_per_second": 6457.958, "eval_steps_per_second": 2.105, "step": 162411 }, { "epoch": 129.07, "learning_rate": 1.7418586179507547e-05, "loss": 3.3061, "step": 162500 }, { "epoch": 129.47, "learning_rate": 1.7410643367752185e-05, "loss": 3.3069, "step": 163000 }, { "epoch": 129.86, "learning_rate": 1.7402700555996826e-05, "loss": 3.3048, "step": 163500 }, { "epoch": 130.0, "eval_loss": 3.1463096141815186, "eval_runtime": 69.1425, "eval_samples_per_second": 6212.168, "eval_steps_per_second": 2.025, "step": 163670 }, { "epoch": 130.26, "learning_rate": 1.7394757744241464e-05, "loss": 3.3077, "step": 164000 }, { "epoch": 130.66, "learning_rate": 1.7386814932486103e-05, "loss": 3.3014, "step": 164500 }, { "epoch": 131.0, "eval_loss": 3.150258779525757, "eval_runtime": 67.5137, "eval_samples_per_second": 6362.046, "eval_steps_per_second": 2.074, "step": 164929 }, { "epoch": 131.06, "learning_rate": 1.737887212073074e-05, "loss": 3.3016, "step": 165000 }, { "epoch": 131.45, "learning_rate": 1.737092930897538e-05, "loss": 3.299, "step": 165500 }, { "epoch": 131.85, "learning_rate": 1.7362986497220017e-05, "loss": 3.2973, "step": 166000 }, { "epoch": 132.0, "eval_loss": 3.137458562850952, "eval_runtime": 65.1087, "eval_samples_per_second": 6597.047, "eval_steps_per_second": 2.15, "step": 166188 }, { "epoch": 132.25, "learning_rate": 1.7355043685464655e-05, "loss": 3.2985, "step": 166500 }, { "epoch": 132.64, "learning_rate": 1.7347100873709296e-05, "loss": 3.2981, "step": 167000 }, { "epoch": 133.0, "eval_loss": 3.141355514526367, "eval_runtime": 67.2148, "eval_samples_per_second": 6390.33, "eval_steps_per_second": 2.083, "step": 167447 }, { "epoch": 133.04, "learning_rate": 1.7339158061953935e-05, "loss": 3.2958, "step": 167500 }, { "epoch": 133.44, "learning_rate": 1.7331215250198573e-05, "loss": 3.2958, "step": 168000 }, { "epoch": 133.84, "learning_rate": 1.732327243844321e-05, "loss": 3.2967, "step": 168500 }, { "epoch": 134.0, "eval_loss": 3.1341967582702637, "eval_runtime": 68.1661, "eval_samples_per_second": 6301.149, "eval_steps_per_second": 2.054, "step": 168706 }, { "epoch": 134.23, "learning_rate": 1.731532962668785e-05, "loss": 3.2893, "step": 169000 }, { "epoch": 134.63, "learning_rate": 1.7307386814932487e-05, "loss": 3.2895, "step": 169500 }, { "epoch": 135.0, "eval_loss": 3.1283481121063232, "eval_runtime": 66.489, "eval_samples_per_second": 6460.089, "eval_steps_per_second": 2.106, "step": 169965 }, { "epoch": 135.03, "learning_rate": 1.7299444003177125e-05, "loss": 3.2901, "step": 170000 }, { "epoch": 135.42, "learning_rate": 1.7291501191421763e-05, "loss": 3.2859, "step": 170500 }, { "epoch": 135.82, "learning_rate": 1.72835583796664e-05, "loss": 3.2873, "step": 171000 }, { "epoch": 136.0, "eval_loss": 3.124046802520752, "eval_runtime": 66.2074, "eval_samples_per_second": 6487.569, "eval_steps_per_second": 2.115, "step": 171224 }, { "epoch": 136.22, "learning_rate": 1.7275615567911043e-05, "loss": 3.2856, "step": 171500 }, { "epoch": 136.62, "learning_rate": 1.726767275615568e-05, "loss": 3.2823, "step": 172000 }, { "epoch": 137.0, "eval_loss": 3.125671625137329, "eval_runtime": 66.722, "eval_samples_per_second": 6437.534, "eval_steps_per_second": 2.098, "step": 172483 }, { "epoch": 137.01, "learning_rate": 1.725972994440032e-05, "loss": 3.2824, "step": 172500 }, { "epoch": 137.41, "learning_rate": 1.7251787132644957e-05, "loss": 3.2802, "step": 173000 }, { "epoch": 137.81, "learning_rate": 1.7243844320889595e-05, "loss": 3.2787, "step": 173500 }, { "epoch": 138.0, "eval_loss": 3.1257822513580322, "eval_runtime": 66.6533, "eval_samples_per_second": 6444.166, "eval_steps_per_second": 2.1, "step": 173742 }, { "epoch": 138.2, "learning_rate": 1.7235901509134233e-05, "loss": 3.2785, "step": 174000 }, { "epoch": 138.6, "learning_rate": 1.722795869737887e-05, "loss": 3.277, "step": 174500 }, { "epoch": 139.0, "learning_rate": 1.722001588562351e-05, "loss": 3.274, "step": 175000 }, { "epoch": 139.0, "eval_loss": 3.1187708377838135, "eval_runtime": 65.9797, "eval_samples_per_second": 6509.959, "eval_steps_per_second": 2.122, "step": 175001 }, { "epoch": 139.4, "learning_rate": 1.721207307386815e-05, "loss": 3.277, "step": 175500 }, { "epoch": 139.79, "learning_rate": 1.720413026211279e-05, "loss": 3.2741, "step": 176000 }, { "epoch": 140.0, "eval_loss": 3.1154379844665527, "eval_runtime": 65.1476, "eval_samples_per_second": 6593.106, "eval_steps_per_second": 2.149, "step": 176260 }, { "epoch": 140.19, "learning_rate": 1.7196187450357427e-05, "loss": 3.2736, "step": 176500 }, { "epoch": 140.59, "learning_rate": 1.718824463860207e-05, "loss": 3.2685, "step": 177000 }, { "epoch": 140.98, "learning_rate": 1.7180301826846707e-05, "loss": 3.27, "step": 177500 }, { "epoch": 141.0, "eval_loss": 3.115006446838379, "eval_runtime": 65.9756, "eval_samples_per_second": 6510.361, "eval_steps_per_second": 2.122, "step": 177519 }, { "epoch": 141.38, "learning_rate": 1.7172359015091345e-05, "loss": 3.2685, "step": 178000 }, { "epoch": 141.78, "learning_rate": 1.7164416203335983e-05, "loss": 3.2679, "step": 178500 }, { "epoch": 142.0, "eval_loss": 3.107903003692627, "eval_runtime": 66.0823, "eval_samples_per_second": 6499.85, "eval_steps_per_second": 2.119, "step": 178778 }, { "epoch": 142.18, "learning_rate": 1.715647339158062e-05, "loss": 3.2668, "step": 179000 }, { "epoch": 142.57, "learning_rate": 1.714853057982526e-05, "loss": 3.265, "step": 179500 }, { "epoch": 142.97, "learning_rate": 1.7140587768069898e-05, "loss": 3.2635, "step": 180000 }, { "epoch": 143.0, "eval_loss": 3.105329990386963, "eval_runtime": 66.3798, "eval_samples_per_second": 6470.715, "eval_steps_per_second": 2.109, "step": 180037 }, { "epoch": 143.37, "learning_rate": 1.7132644956314536e-05, "loss": 3.2633, "step": 180500 }, { "epoch": 143.76, "learning_rate": 1.7124702144559177e-05, "loss": 3.2607, "step": 181000 }, { "epoch": 144.0, "eval_loss": 3.1035091876983643, "eval_runtime": 65.8577, "eval_samples_per_second": 6522.012, "eval_steps_per_second": 2.126, "step": 181296 }, { "epoch": 144.16, "learning_rate": 1.7116759332803815e-05, "loss": 3.2625, "step": 181500 }, { "epoch": 144.56, "learning_rate": 1.7108816521048453e-05, "loss": 3.2567, "step": 182000 }, { "epoch": 144.96, "learning_rate": 1.710087370929309e-05, "loss": 3.2569, "step": 182500 }, { "epoch": 145.0, "eval_loss": 3.100468158721924, "eval_runtime": 65.1424, "eval_samples_per_second": 6593.633, "eval_steps_per_second": 2.149, "step": 182555 }, { "epoch": 145.35, "learning_rate": 1.709293089753773e-05, "loss": 3.2539, "step": 183000 }, { "epoch": 145.75, "learning_rate": 1.7084988085782368e-05, "loss": 3.2553, "step": 183500 }, { "epoch": 146.0, "eval_loss": 3.1042771339416504, "eval_runtime": 66.0185, "eval_samples_per_second": 6506.128, "eval_steps_per_second": 2.121, "step": 183814 }, { "epoch": 146.15, "learning_rate": 1.7077045274027006e-05, "loss": 3.2557, "step": 184000 }, { "epoch": 146.54, "learning_rate": 1.7069102462271644e-05, "loss": 3.2522, "step": 184500 }, { "epoch": 146.94, "learning_rate": 1.7061159650516282e-05, "loss": 3.254, "step": 185000 }, { "epoch": 147.0, "eval_loss": 3.096667766571045, "eval_runtime": 66.2171, "eval_samples_per_second": 6486.615, "eval_steps_per_second": 2.114, "step": 185073 }, { "epoch": 147.34, "learning_rate": 1.7053216838760924e-05, "loss": 3.2498, "step": 185500 }, { "epoch": 147.74, "learning_rate": 1.704527402700556e-05, "loss": 3.2514, "step": 186000 }, { "epoch": 148.0, "eval_loss": 3.0894362926483154, "eval_runtime": 65.4891, "eval_samples_per_second": 6558.726, "eval_steps_per_second": 2.138, "step": 186332 }, { "epoch": 148.13, "learning_rate": 1.70373312152502e-05, "loss": 3.2487, "step": 186500 }, { "epoch": 148.53, "learning_rate": 1.7029388403494838e-05, "loss": 3.2473, "step": 187000 }, { "epoch": 148.93, "learning_rate": 1.7021445591739476e-05, "loss": 3.2472, "step": 187500 }, { "epoch": 149.0, "eval_loss": 3.088639974594116, "eval_runtime": 64.6766, "eval_samples_per_second": 6641.12, "eval_steps_per_second": 2.165, "step": 187591 }, { "epoch": 149.32, "learning_rate": 1.7013502779984117e-05, "loss": 3.2451, "step": 188000 }, { "epoch": 149.72, "learning_rate": 1.7005559968228756e-05, "loss": 3.2449, "step": 188500 }, { "epoch": 150.0, "eval_loss": 3.0863802433013916, "eval_runtime": 66.6012, "eval_samples_per_second": 6449.206, "eval_steps_per_second": 2.102, "step": 188850 }, { "epoch": 150.12, "learning_rate": 1.6997617156473394e-05, "loss": 3.2428, "step": 189000 }, { "epoch": 150.52, "learning_rate": 1.6989674344718032e-05, "loss": 3.243, "step": 189500 }, { "epoch": 150.91, "learning_rate": 1.698173153296267e-05, "loss": 3.241, "step": 190000 }, { "epoch": 151.0, "eval_loss": 3.0850908756256104, "eval_runtime": 66.3958, "eval_samples_per_second": 6469.162, "eval_steps_per_second": 2.109, "step": 190109 }, { "epoch": 151.31, "learning_rate": 1.6973788721207308e-05, "loss": 3.2395, "step": 190500 }, { "epoch": 151.71, "learning_rate": 1.696584590945195e-05, "loss": 3.236, "step": 191000 }, { "epoch": 152.0, "eval_loss": 3.081743001937866, "eval_runtime": 66.9658, "eval_samples_per_second": 6414.094, "eval_steps_per_second": 2.091, "step": 191368 }, { "epoch": 152.1, "learning_rate": 1.6957903097696588e-05, "loss": 3.239, "step": 191500 }, { "epoch": 152.5, "learning_rate": 1.6949960285941226e-05, "loss": 3.237, "step": 192000 }, { "epoch": 152.9, "learning_rate": 1.6942017474185864e-05, "loss": 3.2335, "step": 192500 }, { "epoch": 153.0, "eval_loss": 3.0783889293670654, "eval_runtime": 65.8942, "eval_samples_per_second": 6518.402, "eval_steps_per_second": 2.125, "step": 192627 }, { "epoch": 153.3, "learning_rate": 1.6934074662430502e-05, "loss": 3.2341, "step": 193000 }, { "epoch": 153.69, "learning_rate": 1.692613185067514e-05, "loss": 3.2366, "step": 193500 }, { "epoch": 154.0, "eval_loss": 3.0786499977111816, "eval_runtime": 65.3086, "eval_samples_per_second": 6576.852, "eval_steps_per_second": 2.144, "step": 193886 }, { "epoch": 154.09, "learning_rate": 1.6918189038919778e-05, "loss": 3.2286, "step": 194000 }, { "epoch": 154.49, "learning_rate": 1.6910246227164416e-05, "loss": 3.2319, "step": 194500 }, { "epoch": 154.88, "learning_rate": 1.6902303415409054e-05, "loss": 3.2303, "step": 195000 }, { "epoch": 155.0, "eval_loss": 3.071290969848633, "eval_runtime": 69.2208, "eval_samples_per_second": 6205.141, "eval_steps_per_second": 2.023, "step": 195145 }, { "epoch": 155.28, "learning_rate": 1.6894360603653696e-05, "loss": 3.2274, "step": 195500 }, { "epoch": 155.68, "learning_rate": 1.6886417791898334e-05, "loss": 3.2256, "step": 196000 }, { "epoch": 156.0, "eval_loss": 3.0702176094055176, "eval_runtime": 66.5292, "eval_samples_per_second": 6456.188, "eval_steps_per_second": 2.104, "step": 196404 }, { "epoch": 156.08, "learning_rate": 1.6878474980142972e-05, "loss": 3.2265, "step": 196500 }, { "epoch": 156.47, "learning_rate": 1.687053216838761e-05, "loss": 3.226, "step": 197000 }, { "epoch": 156.87, "learning_rate": 1.686258935663225e-05, "loss": 3.2236, "step": 197500 }, { "epoch": 157.0, "eval_loss": 3.0689454078674316, "eval_runtime": 65.5204, "eval_samples_per_second": 6555.593, "eval_steps_per_second": 2.137, "step": 197663 }, { "epoch": 157.27, "learning_rate": 1.6854646544876886e-05, "loss": 3.2239, "step": 198000 }, { "epoch": 157.66, "learning_rate": 1.6846703733121525e-05, "loss": 3.2216, "step": 198500 }, { "epoch": 158.0, "eval_loss": 3.065370798110962, "eval_runtime": 65.6323, "eval_samples_per_second": 6544.412, "eval_steps_per_second": 2.133, "step": 198922 }, { "epoch": 158.06, "learning_rate": 1.6838760921366163e-05, "loss": 3.2211, "step": 199000 }, { "epoch": 158.46, "learning_rate": 1.6830818109610804e-05, "loss": 3.2198, "step": 199500 }, { "epoch": 158.86, "learning_rate": 1.6822875297855442e-05, "loss": 3.2192, "step": 200000 }, { "epoch": 159.0, "eval_loss": 3.066819429397583, "eval_runtime": 65.241, "eval_samples_per_second": 6583.671, "eval_steps_per_second": 2.146, "step": 200181 }, { "epoch": 159.25, "learning_rate": 1.681493248610008e-05, "loss": 3.2202, "step": 200500 }, { "epoch": 159.65, "learning_rate": 1.6806989674344722e-05, "loss": 3.2166, "step": 201000 }, { "epoch": 160.0, "eval_loss": 3.0597753524780273, "eval_runtime": 65.8442, "eval_samples_per_second": 6523.358, "eval_steps_per_second": 2.126, "step": 201440 }, { "epoch": 160.05, "learning_rate": 1.679904686258936e-05, "loss": 3.2178, "step": 201500 }, { "epoch": 160.44, "learning_rate": 1.6791104050833998e-05, "loss": 3.2123, "step": 202000 }, { "epoch": 160.84, "learning_rate": 1.6783161239078636e-05, "loss": 3.2131, "step": 202500 }, { "epoch": 161.0, "eval_loss": 3.058962106704712, "eval_runtime": 67.0347, "eval_samples_per_second": 6407.5, "eval_steps_per_second": 2.088, "step": 202699 }, { "epoch": 161.24, "learning_rate": 1.6775218427323274e-05, "loss": 3.2135, "step": 203000 }, { "epoch": 161.64, "learning_rate": 1.6767275615567912e-05, "loss": 3.211, "step": 203500 }, { "epoch": 162.0, "eval_loss": 3.0535287857055664, "eval_runtime": 67.9305, "eval_samples_per_second": 6323.007, "eval_steps_per_second": 2.061, "step": 203958 }, { "epoch": 162.03, "learning_rate": 1.675933280381255e-05, "loss": 3.2096, "step": 204000 }, { "epoch": 162.43, "learning_rate": 1.675138999205719e-05, "loss": 3.2106, "step": 204500 }, { "epoch": 162.83, "learning_rate": 1.6743447180301827e-05, "loss": 3.2074, "step": 205000 }, { "epoch": 163.0, "eval_loss": 3.0549418926239014, "eval_runtime": 65.8879, "eval_samples_per_second": 6519.03, "eval_steps_per_second": 2.125, "step": 205217 }, { "epoch": 163.22, "learning_rate": 1.6735504368546468e-05, "loss": 3.2092, "step": 205500 }, { "epoch": 163.62, "learning_rate": 1.6727561556791106e-05, "loss": 3.2055, "step": 206000 }, { "epoch": 164.0, "eval_loss": 3.0483028888702393, "eval_runtime": 66.7146, "eval_samples_per_second": 6438.247, "eval_steps_per_second": 2.098, "step": 206476 }, { "epoch": 164.02, "learning_rate": 1.6719618745035744e-05, "loss": 3.2077, "step": 206500 }, { "epoch": 164.42, "learning_rate": 1.6711675933280383e-05, "loss": 3.2058, "step": 207000 }, { "epoch": 164.81, "learning_rate": 1.670373312152502e-05, "loss": 3.2027, "step": 207500 }, { "epoch": 165.0, "eval_loss": 3.0438058376312256, "eval_runtime": 69.564, "eval_samples_per_second": 6174.528, "eval_steps_per_second": 2.013, "step": 207735 }, { "epoch": 165.21, "learning_rate": 1.669579030976966e-05, "loss": 3.2033, "step": 208000 }, { "epoch": 165.61, "learning_rate": 1.6687847498014297e-05, "loss": 3.2015, "step": 208500 }, { "epoch": 166.0, "eval_loss": 3.0477957725524902, "eval_runtime": 69.5128, "eval_samples_per_second": 6179.076, "eval_steps_per_second": 2.014, "step": 208994 }, { "epoch": 166.0, "learning_rate": 1.6679904686258935e-05, "loss": 3.2015, "step": 209000 }, { "epoch": 166.4, "learning_rate": 1.6671961874503577e-05, "loss": 3.2014, "step": 209500 }, { "epoch": 166.8, "learning_rate": 1.6664019062748215e-05, "loss": 3.1994, "step": 210000 }, { "epoch": 167.0, "eval_loss": 3.043379783630371, "eval_runtime": 66.3708, "eval_samples_per_second": 6471.597, "eval_steps_per_second": 2.109, "step": 210253 }, { "epoch": 167.2, "learning_rate": 1.6656076250992853e-05, "loss": 3.1976, "step": 210500 }, { "epoch": 167.59, "learning_rate": 1.664813343923749e-05, "loss": 3.1961, "step": 211000 }, { "epoch": 167.99, "learning_rate": 1.664019062748213e-05, "loss": 3.1992, "step": 211500 }, { "epoch": 168.0, "eval_loss": 3.040722131729126, "eval_runtime": 66.107, "eval_samples_per_second": 6497.419, "eval_steps_per_second": 2.118, "step": 211512 }, { "epoch": 168.39, "learning_rate": 1.663224781572677e-05, "loss": 3.1943, "step": 212000 }, { "epoch": 168.78, "learning_rate": 1.662430500397141e-05, "loss": 3.1947, "step": 212500 }, { "epoch": 169.0, "eval_loss": 3.0416460037231445, "eval_runtime": 68.4694, "eval_samples_per_second": 6273.243, "eval_steps_per_second": 2.045, "step": 212771 }, { "epoch": 169.18, "learning_rate": 1.6616362192216047e-05, "loss": 3.196, "step": 213000 }, { "epoch": 169.58, "learning_rate": 1.6608419380460685e-05, "loss": 3.1882, "step": 213500 }, { "epoch": 169.98, "learning_rate": 1.6600476568705323e-05, "loss": 3.1922, "step": 214000 }, { "epoch": 170.0, "eval_loss": 3.0347278118133545, "eval_runtime": 68.2044, "eval_samples_per_second": 6297.616, "eval_steps_per_second": 2.053, "step": 214030 }, { "epoch": 170.37, "learning_rate": 1.659253375694996e-05, "loss": 3.1876, "step": 214500 }, { "epoch": 170.77, "learning_rate": 1.65845909451946e-05, "loss": 3.1907, "step": 215000 }, { "epoch": 171.0, "eval_loss": 3.0383665561676025, "eval_runtime": 66.7627, "eval_samples_per_second": 6433.608, "eval_steps_per_second": 2.097, "step": 215289 }, { "epoch": 171.17, "learning_rate": 1.657664813343924e-05, "loss": 3.1908, "step": 215500 }, { "epoch": 171.56, "learning_rate": 1.656870532168388e-05, "loss": 3.1873, "step": 216000 }, { "epoch": 171.96, "learning_rate": 1.6560762509928517e-05, "loss": 3.1835, "step": 216500 }, { "epoch": 172.0, "eval_loss": 3.0322234630584717, "eval_runtime": 66.2298, "eval_samples_per_second": 6485.37, "eval_steps_per_second": 2.114, "step": 216548 }, { "epoch": 172.36, "learning_rate": 1.6552819698173155e-05, "loss": 3.188, "step": 217000 }, { "epoch": 172.76, "learning_rate": 1.6544876886417793e-05, "loss": 3.1865, "step": 217500 }, { "epoch": 173.0, "eval_loss": 3.031705141067505, "eval_runtime": 67.9322, "eval_samples_per_second": 6322.851, "eval_steps_per_second": 2.061, "step": 217807 }, { "epoch": 173.15, "learning_rate": 1.653693407466243e-05, "loss": 3.1843, "step": 218000 }, { "epoch": 173.55, "learning_rate": 1.652899126290707e-05, "loss": 3.1809, "step": 218500 }, { "epoch": 173.95, "learning_rate": 1.6521048451151707e-05, "loss": 3.1824, "step": 219000 }, { "epoch": 174.0, "eval_loss": 3.026932954788208, "eval_runtime": 65.6782, "eval_samples_per_second": 6539.845, "eval_steps_per_second": 2.132, "step": 219066 }, { "epoch": 174.34, "learning_rate": 1.651310563939635e-05, "loss": 3.1827, "step": 219500 }, { "epoch": 174.74, "learning_rate": 1.6505162827640987e-05, "loss": 3.1774, "step": 220000 }, { "epoch": 175.0, "eval_loss": 3.025967597961426, "eval_runtime": 65.154, "eval_samples_per_second": 6592.453, "eval_steps_per_second": 2.149, "step": 220325 }, { "epoch": 175.14, "learning_rate": 1.6497220015885625e-05, "loss": 3.178, "step": 220500 }, { "epoch": 175.54, "learning_rate": 1.6489277204130263e-05, "loss": 3.1794, "step": 221000 }, { "epoch": 175.93, "learning_rate": 1.64813343923749e-05, "loss": 3.1794, "step": 221500 }, { "epoch": 176.0, "eval_loss": 3.029658079147339, "eval_runtime": 67.8952, "eval_samples_per_second": 6326.289, "eval_steps_per_second": 2.062, "step": 221584 }, { "epoch": 176.33, "learning_rate": 1.647339158061954e-05, "loss": 3.1753, "step": 222000 }, { "epoch": 176.73, "learning_rate": 1.6465448768864178e-05, "loss": 3.1748, "step": 222500 }, { "epoch": 177.0, "eval_loss": 3.0214033126831055, "eval_runtime": 65.881, "eval_samples_per_second": 6519.707, "eval_steps_per_second": 2.125, "step": 222843 }, { "epoch": 177.12, "learning_rate": 1.6457505957108816e-05, "loss": 3.1752, "step": 223000 }, { "epoch": 177.52, "learning_rate": 1.6449563145353454e-05, "loss": 3.1776, "step": 223500 }, { "epoch": 177.92, "learning_rate": 1.6441620333598095e-05, "loss": 3.1726, "step": 224000 }, { "epoch": 178.0, "eval_loss": 3.014864206314087, "eval_runtime": 68.4186, "eval_samples_per_second": 6277.897, "eval_steps_per_second": 2.046, "step": 224102 }, { "epoch": 178.32, "learning_rate": 1.6433677521842733e-05, "loss": 3.1694, "step": 224500 }, { "epoch": 178.71, "learning_rate": 1.6425734710087375e-05, "loss": 3.1724, "step": 225000 }, { "epoch": 179.0, "eval_loss": 3.019036054611206, "eval_runtime": 67.327, "eval_samples_per_second": 6379.683, "eval_steps_per_second": 2.079, "step": 225361 }, { "epoch": 179.11, "learning_rate": 1.6417791898332013e-05, "loss": 3.1691, "step": 225500 }, { "epoch": 179.51, "learning_rate": 1.640984908657665e-05, "loss": 3.168, "step": 226000 }, { "epoch": 179.9, "learning_rate": 1.640190627482129e-05, "loss": 3.1712, "step": 226500 }, { "epoch": 180.0, "eval_loss": 3.016796112060547, "eval_runtime": 67.9067, "eval_samples_per_second": 6325.224, "eval_steps_per_second": 2.062, "step": 226620 }, { "epoch": 180.3, "learning_rate": 1.6393963463065927e-05, "loss": 3.1675, "step": 227000 }, { "epoch": 180.7, "learning_rate": 1.6386020651310565e-05, "loss": 3.1696, "step": 227500 }, { "epoch": 181.0, "eval_loss": 3.011127233505249, "eval_runtime": 65.7782, "eval_samples_per_second": 6529.897, "eval_steps_per_second": 2.128, "step": 227879 }, { "epoch": 181.1, "learning_rate": 1.6378077839555204e-05, "loss": 3.1639, "step": 228000 }, { "epoch": 181.49, "learning_rate": 1.637013502779984e-05, "loss": 3.1642, "step": 228500 }, { "epoch": 181.89, "learning_rate": 1.636219221604448e-05, "loss": 3.1655, "step": 229000 }, { "epoch": 182.0, "eval_loss": 3.0097951889038086, "eval_runtime": 65.8109, "eval_samples_per_second": 6526.656, "eval_steps_per_second": 2.127, "step": 229138 }, { "epoch": 182.29, "learning_rate": 1.635424940428912e-05, "loss": 3.1634, "step": 229500 }, { "epoch": 182.68, "learning_rate": 1.634630659253376e-05, "loss": 3.163, "step": 230000 }, { "epoch": 183.0, "eval_loss": 3.0031604766845703, "eval_runtime": 66.7249, "eval_samples_per_second": 6437.251, "eval_steps_per_second": 2.098, "step": 230397 }, { "epoch": 183.08, "learning_rate": 1.6338363780778397e-05, "loss": 3.1626, "step": 230500 }, { "epoch": 183.48, "learning_rate": 1.6330420969023036e-05, "loss": 3.1607, "step": 231000 }, { "epoch": 183.88, "learning_rate": 1.6322478157267674e-05, "loss": 3.1602, "step": 231500 }, { "epoch": 184.0, "eval_loss": 3.005112886428833, "eval_runtime": 66.9258, "eval_samples_per_second": 6417.929, "eval_steps_per_second": 2.092, "step": 231656 }, { "epoch": 184.27, "learning_rate": 1.6314535345512312e-05, "loss": 3.1592, "step": 232000 }, { "epoch": 184.67, "learning_rate": 1.630659253375695e-05, "loss": 3.1614, "step": 232500 }, { "epoch": 185.0, "eval_loss": 3.0034019947052, "eval_runtime": 65.938, "eval_samples_per_second": 6514.074, "eval_steps_per_second": 2.123, "step": 232915 }, { "epoch": 185.07, "learning_rate": 1.6298649722001588e-05, "loss": 3.158, "step": 233000 }, { "epoch": 185.46, "learning_rate": 1.629070691024623e-05, "loss": 3.1587, "step": 233500 }, { "epoch": 185.86, "learning_rate": 1.6282764098490868e-05, "loss": 3.156, "step": 234000 }, { "epoch": 186.0, "eval_loss": 2.9994444847106934, "eval_runtime": 68.12, "eval_samples_per_second": 6305.418, "eval_steps_per_second": 2.055, "step": 234174 }, { "epoch": 186.26, "learning_rate": 1.6274821286735506e-05, "loss": 3.155, "step": 234500 }, { "epoch": 186.66, "learning_rate": 1.6266878474980144e-05, "loss": 3.1554, "step": 235000 }, { "epoch": 187.0, "eval_loss": 3.0035860538482666, "eval_runtime": 67.5701, "eval_samples_per_second": 6356.733, "eval_steps_per_second": 2.072, "step": 235433 }, { "epoch": 187.05, "learning_rate": 1.6258935663224782e-05, "loss": 3.1547, "step": 235500 }, { "epoch": 187.45, "learning_rate": 1.625099285146942e-05, "loss": 3.1524, "step": 236000 }, { "epoch": 187.85, "learning_rate": 1.624305003971406e-05, "loss": 3.154, "step": 236500 }, { "epoch": 188.0, "eval_loss": 3.0000782012939453, "eval_runtime": 70.0288, "eval_samples_per_second": 6133.549, "eval_steps_per_second": 1.999, "step": 236692 }, { "epoch": 188.24, "learning_rate": 1.62351072279587e-05, "loss": 3.1512, "step": 237000 }, { "epoch": 188.64, "learning_rate": 1.6227164416203338e-05, "loss": 3.1509, "step": 237500 }, { "epoch": 189.0, "eval_loss": 3.0007848739624023, "eval_runtime": 65.5374, "eval_samples_per_second": 6553.891, "eval_steps_per_second": 2.136, "step": 237951 }, { "epoch": 189.04, "learning_rate": 1.6219221604447976e-05, "loss": 3.1512, "step": 238000 }, { "epoch": 189.44, "learning_rate": 1.6211278792692614e-05, "loss": 3.1506, "step": 238500 }, { "epoch": 189.83, "learning_rate": 1.6203335980937252e-05, "loss": 3.1479, "step": 239000 }, { "epoch": 190.0, "eval_loss": 2.9929776191711426, "eval_runtime": 66.7167, "eval_samples_per_second": 6438.043, "eval_steps_per_second": 2.098, "step": 239210 }, { "epoch": 190.23, "learning_rate": 1.6195393169181894e-05, "loss": 3.1498, "step": 239500 }, { "epoch": 190.63, "learning_rate": 1.6187450357426532e-05, "loss": 3.1462, "step": 240000 }, { "epoch": 191.0, "eval_loss": 2.9895858764648438, "eval_runtime": 67.9606, "eval_samples_per_second": 6320.204, "eval_steps_per_second": 2.06, "step": 240469 }, { "epoch": 191.02, "learning_rate": 1.617950754567117e-05, "loss": 3.1475, "step": 240500 }, { "epoch": 191.42, "learning_rate": 1.6171564733915808e-05, "loss": 3.1465, "step": 241000 }, { "epoch": 191.82, "learning_rate": 1.6163621922160446e-05, "loss": 3.1446, "step": 241500 }, { "epoch": 192.0, "eval_loss": 2.9905622005462646, "eval_runtime": 65.6203, "eval_samples_per_second": 6545.615, "eval_steps_per_second": 2.133, "step": 241728 }, { "epoch": 192.22, "learning_rate": 1.6155679110405084e-05, "loss": 3.1436, "step": 242000 }, { "epoch": 192.61, "learning_rate": 1.6147736298649722e-05, "loss": 3.1454, "step": 242500 }, { "epoch": 193.0, "eval_loss": 2.990659475326538, "eval_runtime": 66.7234, "eval_samples_per_second": 6437.4, "eval_steps_per_second": 2.098, "step": 242987 }, { "epoch": 193.01, "learning_rate": 1.613979348689436e-05, "loss": 3.1444, "step": 243000 }, { "epoch": 193.41, "learning_rate": 1.6131850675139002e-05, "loss": 3.1417, "step": 243500 }, { "epoch": 193.8, "learning_rate": 1.612390786338364e-05, "loss": 3.1413, "step": 244000 }, { "epoch": 194.0, "eval_loss": 2.9828615188598633, "eval_runtime": 66.2473, "eval_samples_per_second": 6483.659, "eval_steps_per_second": 2.113, "step": 244246 }, { "epoch": 194.2, "learning_rate": 1.6115965051628278e-05, "loss": 3.1408, "step": 244500 }, { "epoch": 194.6, "learning_rate": 1.6108022239872916e-05, "loss": 3.1385, "step": 245000 }, { "epoch": 195.0, "learning_rate": 1.6100079428117554e-05, "loss": 3.1359, "step": 245500 }, { "epoch": 195.0, "eval_loss": 2.9874227046966553, "eval_runtime": 65.5687, "eval_samples_per_second": 6550.767, "eval_steps_per_second": 2.135, "step": 245505 }, { "epoch": 195.39, "learning_rate": 1.6092136616362192e-05, "loss": 3.1408, "step": 246000 }, { "epoch": 195.79, "learning_rate": 1.608419380460683e-05, "loss": 3.1353, "step": 246500 }, { "epoch": 196.0, "eval_loss": 2.9848384857177734, "eval_runtime": 67.1551, "eval_samples_per_second": 6396.015, "eval_steps_per_second": 2.085, "step": 246764 }, { "epoch": 196.19, "learning_rate": 1.607625099285147e-05, "loss": 3.1378, "step": 247000 }, { "epoch": 196.58, "learning_rate": 1.6068308181096107e-05, "loss": 3.1353, "step": 247500 }, { "epoch": 196.98, "learning_rate": 1.6060365369340748e-05, "loss": 3.1358, "step": 248000 }, { "epoch": 197.0, "eval_loss": 2.9764184951782227, "eval_runtime": 65.592, "eval_samples_per_second": 6548.433, "eval_steps_per_second": 2.134, "step": 248023 }, { "epoch": 197.38, "learning_rate": 1.6052422557585386e-05, "loss": 3.1362, "step": 248500 }, { "epoch": 197.78, "learning_rate": 1.6044479745830025e-05, "loss": 3.132, "step": 249000 }, { "epoch": 198.0, "eval_loss": 2.981771945953369, "eval_runtime": 67.0618, "eval_samples_per_second": 6404.909, "eval_steps_per_second": 2.088, "step": 249282 }, { "epoch": 198.17, "learning_rate": 1.6036536934074666e-05, "loss": 3.1366, "step": 249500 }, { "epoch": 198.57, "learning_rate": 1.6028594122319304e-05, "loss": 3.1325, "step": 250000 }, { "epoch": 198.97, "learning_rate": 1.6020651310563942e-05, "loss": 3.1299, "step": 250500 }, { "epoch": 199.0, "eval_loss": 2.9742584228515625, "eval_runtime": 65.5313, "eval_samples_per_second": 6554.506, "eval_steps_per_second": 2.136, "step": 250541 }, { "epoch": 199.36, "learning_rate": 1.601270849880858e-05, "loss": 3.1287, "step": 251000 }, { "epoch": 199.76, "learning_rate": 1.600476568705322e-05, "loss": 3.1274, "step": 251500 }, { "epoch": 200.0, "eval_loss": 2.979449987411499, "eval_runtime": 66.7024, "eval_samples_per_second": 6439.425, "eval_steps_per_second": 2.099, "step": 251800 }, { "epoch": 200.16, "learning_rate": 1.5996822875297857e-05, "loss": 3.1325, "step": 252000 }, { "epoch": 200.56, "learning_rate": 1.5988880063542495e-05, "loss": 3.1293, "step": 252500 }, { "epoch": 200.95, "learning_rate": 1.5980937251787133e-05, "loss": 3.1289, "step": 253000 }, { "epoch": 201.0, "eval_loss": 2.9777655601501465, "eval_runtime": 65.3465, "eval_samples_per_second": 6573.033, "eval_steps_per_second": 2.142, "step": 253059 }, { "epoch": 201.35, "learning_rate": 1.5972994440031774e-05, "loss": 3.1268, "step": 253500 }, { "epoch": 201.75, "learning_rate": 1.5965051628276412e-05, "loss": 3.1262, "step": 254000 }, { "epoch": 202.0, "eval_loss": 2.9709866046905518, "eval_runtime": 66.479, "eval_samples_per_second": 6461.063, "eval_steps_per_second": 2.106, "step": 254318 }, { "epoch": 202.14, "learning_rate": 1.595710881652105e-05, "loss": 3.1272, "step": 254500 }, { "epoch": 202.54, "learning_rate": 1.594916600476569e-05, "loss": 3.1245, "step": 255000 }, { "epoch": 202.94, "learning_rate": 1.5941223193010327e-05, "loss": 3.1236, "step": 255500 }, { "epoch": 203.0, "eval_loss": 2.970989465713501, "eval_runtime": 67.4753, "eval_samples_per_second": 6365.664, "eval_steps_per_second": 2.075, "step": 255577 }, { "epoch": 203.34, "learning_rate": 1.5933280381254965e-05, "loss": 3.1249, "step": 256000 }, { "epoch": 203.73, "learning_rate": 1.5925337569499603e-05, "loss": 3.1211, "step": 256500 }, { "epoch": 204.0, "eval_loss": 2.974815845489502, "eval_runtime": 69.0984, "eval_samples_per_second": 6216.136, "eval_steps_per_second": 2.026, "step": 256836 }, { "epoch": 204.13, "learning_rate": 1.591739475774424e-05, "loss": 3.1222, "step": 257000 }, { "epoch": 204.53, "learning_rate": 1.590945194598888e-05, "loss": 3.1218, "step": 257500 }, { "epoch": 204.92, "learning_rate": 1.590150913423352e-05, "loss": 3.1198, "step": 258000 }, { "epoch": 205.0, "eval_loss": 2.9712555408477783, "eval_runtime": 66.7029, "eval_samples_per_second": 6439.379, "eval_steps_per_second": 2.099, "step": 258095 }, { "epoch": 205.32, "learning_rate": 1.589356632247816e-05, "loss": 3.1197, "step": 258500 }, { "epoch": 205.72, "learning_rate": 1.5885623510722797e-05, "loss": 3.1192, "step": 259000 }, { "epoch": 206.0, "eval_loss": 2.9651243686676025, "eval_runtime": 65.6816, "eval_samples_per_second": 6539.499, "eval_steps_per_second": 2.131, "step": 259354 }, { "epoch": 206.12, "learning_rate": 1.5877680698967435e-05, "loss": 3.1194, "step": 259500 }, { "epoch": 206.51, "learning_rate": 1.5869737887212073e-05, "loss": 3.1186, "step": 260000 }, { "epoch": 206.91, "learning_rate": 1.5861795075456715e-05, "loss": 3.1155, "step": 260500 }, { "epoch": 207.0, "eval_loss": 2.968662977218628, "eval_runtime": 65.5465, "eval_samples_per_second": 6552.985, "eval_steps_per_second": 2.136, "step": 260613 }, { "epoch": 207.31, "learning_rate": 1.5853852263701353e-05, "loss": 3.1141, "step": 261000 }, { "epoch": 207.7, "learning_rate": 1.584590945194599e-05, "loss": 3.1169, "step": 261500 }, { "epoch": 208.0, "eval_loss": 2.9656100273132324, "eval_runtime": 67.9585, "eval_samples_per_second": 6320.403, "eval_steps_per_second": 2.06, "step": 261872 }, { "epoch": 208.1, "learning_rate": 1.583796664019063e-05, "loss": 3.1172, "step": 262000 }, { "epoch": 208.5, "learning_rate": 1.5830023828435267e-05, "loss": 3.1138, "step": 262500 }, { "epoch": 208.9, "learning_rate": 1.5822081016679905e-05, "loss": 3.1121, "step": 263000 }, { "epoch": 209.0, "eval_loss": 2.958681583404541, "eval_runtime": 66.8558, "eval_samples_per_second": 6424.65, "eval_steps_per_second": 2.094, "step": 263131 }, { "epoch": 209.29, "learning_rate": 1.5814138204924547e-05, "loss": 3.1113, "step": 263500 }, { "epoch": 209.69, "learning_rate": 1.5806195393169185e-05, "loss": 3.1112, "step": 264000 }, { "epoch": 210.0, "eval_loss": 2.957751989364624, "eval_runtime": 65.9634, "eval_samples_per_second": 6511.564, "eval_steps_per_second": 2.122, "step": 264390 }, { "epoch": 210.09, "learning_rate": 1.5798252581413823e-05, "loss": 3.1125, "step": 264500 }, { "epoch": 210.48, "learning_rate": 1.579030976965846e-05, "loss": 3.1108, "step": 265000 }, { "epoch": 210.88, "learning_rate": 1.57823669579031e-05, "loss": 3.1103, "step": 265500 }, { "epoch": 211.0, "eval_loss": 2.958056688308716, "eval_runtime": 65.7149, "eval_samples_per_second": 6536.188, "eval_steps_per_second": 2.13, "step": 265649 }, { "epoch": 211.28, "learning_rate": 1.5774424146147737e-05, "loss": 3.1132, "step": 266000 }, { "epoch": 211.68, "learning_rate": 1.5766481334392375e-05, "loss": 3.1067, "step": 266500 }, { "epoch": 212.0, "eval_loss": 2.9551777839660645, "eval_runtime": 66.1916, "eval_samples_per_second": 6489.115, "eval_steps_per_second": 2.115, "step": 266908 }, { "epoch": 212.07, "learning_rate": 1.5758538522637013e-05, "loss": 3.111, "step": 267000 }, { "epoch": 212.47, "learning_rate": 1.575059571088165e-05, "loss": 3.1079, "step": 267500 }, { "epoch": 212.87, "learning_rate": 1.5742652899126293e-05, "loss": 3.1072, "step": 268000 }, { "epoch": 213.0, "eval_loss": 2.955286741256714, "eval_runtime": 65.635, "eval_samples_per_second": 6544.15, "eval_steps_per_second": 2.133, "step": 268167 }, { "epoch": 213.26, "learning_rate": 1.573471008737093e-05, "loss": 3.105, "step": 268500 }, { "epoch": 213.66, "learning_rate": 1.572676727561557e-05, "loss": 3.1064, "step": 269000 }, { "epoch": 214.0, "eval_loss": 2.958470582962036, "eval_runtime": 67.6125, "eval_samples_per_second": 6352.746, "eval_steps_per_second": 2.071, "step": 269426 }, { "epoch": 214.06, "learning_rate": 1.5718824463860207e-05, "loss": 3.106, "step": 269500 }, { "epoch": 214.46, "learning_rate": 1.5710881652104845e-05, "loss": 3.1057, "step": 270000 }, { "epoch": 214.85, "learning_rate": 1.5702938840349484e-05, "loss": 3.1024, "step": 270500 }, { "epoch": 215.0, "eval_loss": 2.9487507343292236, "eval_runtime": 64.4952, "eval_samples_per_second": 6659.796, "eval_steps_per_second": 2.171, "step": 270685 }, { "epoch": 215.25, "learning_rate": 1.569499602859412e-05, "loss": 3.1035, "step": 271000 }, { "epoch": 215.65, "learning_rate": 1.568705321683876e-05, "loss": 3.105, "step": 271500 }, { "epoch": 216.0, "eval_loss": 2.9549643993377686, "eval_runtime": 66.8525, "eval_samples_per_second": 6424.966, "eval_steps_per_second": 2.094, "step": 271944 }, { "epoch": 216.04, "learning_rate": 1.56791104050834e-05, "loss": 3.1023, "step": 272000 }, { "epoch": 216.44, "learning_rate": 1.567116759332804e-05, "loss": 3.1029, "step": 272500 }, { "epoch": 216.84, "learning_rate": 1.5663224781572678e-05, "loss": 3.0996, "step": 273000 }, { "epoch": 217.0, "eval_loss": 2.949592351913452, "eval_runtime": 65.8464, "eval_samples_per_second": 6523.136, "eval_steps_per_second": 2.126, "step": 273203 }, { "epoch": 217.24, "learning_rate": 1.565528196981732e-05, "loss": 3.0996, "step": 273500 }, { "epoch": 217.63, "learning_rate": 1.5647339158061957e-05, "loss": 3.0997, "step": 274000 }, { "epoch": 218.0, "eval_loss": 2.9485127925872803, "eval_runtime": 65.4708, "eval_samples_per_second": 6560.562, "eval_steps_per_second": 2.138, "step": 274462 }, { "epoch": 218.03, "learning_rate": 1.5639396346306595e-05, "loss": 3.0978, "step": 274500 }, { "epoch": 218.43, "learning_rate": 1.5631453534551233e-05, "loss": 3.0979, "step": 275000 }, { "epoch": 218.82, "learning_rate": 1.562351072279587e-05, "loss": 3.0964, "step": 275500 }, { "epoch": 219.0, "eval_loss": 2.947661876678467, "eval_runtime": 66.9185, "eval_samples_per_second": 6418.63, "eval_steps_per_second": 2.092, "step": 275721 }, { "epoch": 219.22, "learning_rate": 1.561556791104051e-05, "loss": 3.0968, "step": 276000 }, { "epoch": 219.62, "learning_rate": 1.5607625099285148e-05, "loss": 3.0947, "step": 276500 }, { "epoch": 220.0, "eval_loss": 2.9446866512298584, "eval_runtime": 66.1148, "eval_samples_per_second": 6496.654, "eval_steps_per_second": 2.118, "step": 276980 }, { "epoch": 220.02, "learning_rate": 1.5599682287529786e-05, "loss": 3.0961, "step": 277000 }, { "epoch": 220.41, "learning_rate": 1.5591739475774427e-05, "loss": 3.094, "step": 277500 }, { "epoch": 220.81, "learning_rate": 1.5583796664019065e-05, "loss": 3.0966, "step": 278000 }, { "epoch": 221.0, "eval_loss": 2.9384045600891113, "eval_runtime": 67.3818, "eval_samples_per_second": 6374.5, "eval_steps_per_second": 2.078, "step": 278239 }, { "epoch": 221.21, "learning_rate": 1.5575853852263704e-05, "loss": 3.0918, "step": 278500 }, { "epoch": 221.6, "learning_rate": 1.556791104050834e-05, "loss": 3.0944, "step": 279000 }, { "epoch": 222.0, "eval_loss": 2.9392666816711426, "eval_runtime": 64.9736, "eval_samples_per_second": 6610.765, "eval_steps_per_second": 2.155, "step": 279498 }, { "epoch": 222.0, "learning_rate": 1.555996822875298e-05, "loss": 3.0944, "step": 279500 }, { "epoch": 222.4, "learning_rate": 1.5552025416997618e-05, "loss": 3.0916, "step": 280000 }, { "epoch": 222.8, "learning_rate": 1.5544082605242256e-05, "loss": 3.0928, "step": 280500 }, { "epoch": 223.0, "eval_loss": 2.942800521850586, "eval_runtime": 65.8225, "eval_samples_per_second": 6525.505, "eval_steps_per_second": 2.127, "step": 280757 }, { "epoch": 223.19, "learning_rate": 1.5536139793486894e-05, "loss": 3.0899, "step": 281000 }, { "epoch": 223.59, "learning_rate": 1.5528196981731532e-05, "loss": 3.0901, "step": 281500 }, { "epoch": 223.99, "learning_rate": 1.5520254169976174e-05, "loss": 3.0875, "step": 282000 }, { "epoch": 224.0, "eval_loss": 2.9388368129730225, "eval_runtime": 65.5563, "eval_samples_per_second": 6552.001, "eval_steps_per_second": 2.136, "step": 282016 }, { "epoch": 224.38, "learning_rate": 1.5512311358220812e-05, "loss": 3.0873, "step": 282500 }, { "epoch": 224.78, "learning_rate": 1.550436854646545e-05, "loss": 3.0894, "step": 283000 }, { "epoch": 225.0, "eval_loss": 2.935002326965332, "eval_runtime": 65.7054, "eval_samples_per_second": 6537.135, "eval_steps_per_second": 2.131, "step": 283275 }, { "epoch": 225.18, "learning_rate": 1.5496425734710088e-05, "loss": 3.0871, "step": 283500 }, { "epoch": 225.58, "learning_rate": 1.5488482922954726e-05, "loss": 3.0872, "step": 284000 }, { "epoch": 225.97, "learning_rate": 1.5480540111199368e-05, "loss": 3.0901, "step": 284500 }, { "epoch": 226.0, "eval_loss": 2.928438663482666, "eval_runtime": 66.2369, "eval_samples_per_second": 6484.675, "eval_steps_per_second": 2.114, "step": 284534 }, { "epoch": 226.37, "learning_rate": 1.5472597299444006e-05, "loss": 3.0848, "step": 285000 }, { "epoch": 226.77, "learning_rate": 1.5464654487688644e-05, "loss": 3.0869, "step": 285500 }, { "epoch": 227.0, "eval_loss": 2.9335274696350098, "eval_runtime": 65.9383, "eval_samples_per_second": 6514.041, "eval_steps_per_second": 2.123, "step": 285793 }, { "epoch": 227.16, "learning_rate": 1.5456711675933282e-05, "loss": 3.0847, "step": 286000 }, { "epoch": 227.56, "learning_rate": 1.544876886417792e-05, "loss": 3.0857, "step": 286500 }, { "epoch": 227.96, "learning_rate": 1.5440826052422558e-05, "loss": 3.0828, "step": 287000 }, { "epoch": 228.0, "eval_loss": 2.9316186904907227, "eval_runtime": 65.5003, "eval_samples_per_second": 6557.607, "eval_steps_per_second": 2.137, "step": 287052 }, { "epoch": 228.36, "learning_rate": 1.54328832406672e-05, "loss": 3.0808, "step": 287500 }, { "epoch": 228.75, "learning_rate": 1.5424940428911838e-05, "loss": 3.0793, "step": 288000 }, { "epoch": 229.0, "eval_loss": 2.93403959274292, "eval_runtime": 66.0367, "eval_samples_per_second": 6504.337, "eval_steps_per_second": 2.12, "step": 288311 }, { "epoch": 229.15, "learning_rate": 1.5416997617156476e-05, "loss": 3.0833, "step": 288500 }, { "epoch": 229.55, "learning_rate": 1.5409054805401114e-05, "loss": 3.0831, "step": 289000 }, { "epoch": 229.94, "learning_rate": 1.5401111993645752e-05, "loss": 3.082, "step": 289500 }, { "epoch": 230.0, "eval_loss": 2.928305149078369, "eval_runtime": 66.8843, "eval_samples_per_second": 6421.911, "eval_steps_per_second": 2.093, "step": 289570 }, { "epoch": 230.34, "learning_rate": 1.539316918189039e-05, "loss": 3.0808, "step": 290000 }, { "epoch": 230.74, "learning_rate": 1.538522637013503e-05, "loss": 3.08, "step": 290500 }, { "epoch": 231.0, "eval_loss": 2.9264955520629883, "eval_runtime": 66.1243, "eval_samples_per_second": 6495.724, "eval_steps_per_second": 2.117, "step": 290829 }, { "epoch": 231.14, "learning_rate": 1.5377283558379666e-05, "loss": 3.0774, "step": 291000 }, { "epoch": 231.53, "learning_rate": 1.5369340746624305e-05, "loss": 3.0792, "step": 291500 }, { "epoch": 231.93, "learning_rate": 1.5361397934868946e-05, "loss": 3.0782, "step": 292000 }, { "epoch": 232.0, "eval_loss": 2.9241137504577637, "eval_runtime": 67.3701, "eval_samples_per_second": 6375.599, "eval_steps_per_second": 2.078, "step": 292088 }, { "epoch": 232.33, "learning_rate": 1.5353455123113584e-05, "loss": 3.0753, "step": 292500 }, { "epoch": 232.72, "learning_rate": 1.5345512311358222e-05, "loss": 3.0759, "step": 293000 }, { "epoch": 233.0, "eval_loss": 2.922394275665283, "eval_runtime": 66.1924, "eval_samples_per_second": 6489.039, "eval_steps_per_second": 2.115, "step": 293347 }, { "epoch": 233.12, "learning_rate": 1.533756949960286e-05, "loss": 3.0761, "step": 293500 }, { "epoch": 233.52, "learning_rate": 1.53296266878475e-05, "loss": 3.0745, "step": 294000 }, { "epoch": 233.92, "learning_rate": 1.5321683876092137e-05, "loss": 3.0738, "step": 294500 }, { "epoch": 234.0, "eval_loss": 2.9249184131622314, "eval_runtime": 66.5084, "eval_samples_per_second": 6458.212, "eval_steps_per_second": 2.105, "step": 294606 }, { "epoch": 234.31, "learning_rate": 1.5313741064336775e-05, "loss": 3.0734, "step": 295000 }, { "epoch": 234.71, "learning_rate": 1.5305798252581413e-05, "loss": 3.0716, "step": 295500 }, { "epoch": 235.0, "eval_loss": 2.921541929244995, "eval_runtime": 67.1139, "eval_samples_per_second": 6399.944, "eval_steps_per_second": 2.086, "step": 295865 }, { "epoch": 235.11, "learning_rate": 1.5297855440826054e-05, "loss": 3.0746, "step": 296000 }, { "epoch": 235.5, "learning_rate": 1.5289912629070692e-05, "loss": 3.0684, "step": 296500 }, { "epoch": 235.9, "learning_rate": 1.528196981731533e-05, "loss": 3.0731, "step": 297000 }, { "epoch": 236.0, "eval_loss": 2.919463634490967, "eval_runtime": 69.9648, "eval_samples_per_second": 6139.16, "eval_steps_per_second": 2.001, "step": 297124 }, { "epoch": 236.3, "learning_rate": 1.5274027005559972e-05, "loss": 3.0719, "step": 297500 }, { "epoch": 236.7, "learning_rate": 1.526608419380461e-05, "loss": 3.0719, "step": 298000 }, { "epoch": 237.0, "eval_loss": 2.9221296310424805, "eval_runtime": 67.1119, "eval_samples_per_second": 6400.127, "eval_steps_per_second": 2.086, "step": 298383 }, { "epoch": 237.09, "learning_rate": 1.5258141382049248e-05, "loss": 3.0695, "step": 298500 }, { "epoch": 237.49, "learning_rate": 1.5250198570293886e-05, "loss": 3.0716, "step": 299000 }, { "epoch": 237.89, "learning_rate": 1.5242255758538524e-05, "loss": 3.0661, "step": 299500 }, { "epoch": 238.0, "eval_loss": 2.914468288421631, "eval_runtime": 64.8257, "eval_samples_per_second": 6625.844, "eval_steps_per_second": 2.16, "step": 299642 }, { "epoch": 238.28, "learning_rate": 1.5234312946783163e-05, "loss": 3.0688, "step": 300000 }, { "epoch": 238.68, "learning_rate": 1.52263701350278e-05, "loss": 3.0693, "step": 300500 }, { "epoch": 239.0, "eval_loss": 2.9148149490356445, "eval_runtime": 69.3108, "eval_samples_per_second": 6197.087, "eval_steps_per_second": 2.02, "step": 300901 }, { "epoch": 239.08, "learning_rate": 1.521842732327244e-05, "loss": 3.0673, "step": 301000 }, { "epoch": 239.48, "learning_rate": 1.5210484511517079e-05, "loss": 3.0657, "step": 301500 }, { "epoch": 239.87, "learning_rate": 1.5202541699761717e-05, "loss": 3.0656, "step": 302000 }, { "epoch": 240.0, "eval_loss": 2.9158501625061035, "eval_runtime": 66.0293, "eval_samples_per_second": 6505.067, "eval_steps_per_second": 2.12, "step": 302160 }, { "epoch": 240.27, "learning_rate": 1.5194598888006355e-05, "loss": 3.0658, "step": 302500 }, { "epoch": 240.67, "learning_rate": 1.5186656076250995e-05, "loss": 3.0645, "step": 303000 }, { "epoch": 241.0, "eval_loss": 2.9138288497924805, "eval_runtime": 66.8003, "eval_samples_per_second": 6429.982, "eval_steps_per_second": 2.096, "step": 303419 }, { "epoch": 241.06, "learning_rate": 1.5178713264495633e-05, "loss": 3.0659, "step": 303500 }, { "epoch": 241.46, "learning_rate": 1.517077045274027e-05, "loss": 3.0661, "step": 304000 }, { "epoch": 241.86, "learning_rate": 1.5162827640984909e-05, "loss": 3.0617, "step": 304500 }, { "epoch": 242.0, "eval_loss": 2.9083235263824463, "eval_runtime": 66.923, "eval_samples_per_second": 6418.2, "eval_steps_per_second": 2.092, "step": 304678 }, { "epoch": 242.26, "learning_rate": 1.5154884829229549e-05, "loss": 3.0654, "step": 305000 }, { "epoch": 242.65, "learning_rate": 1.5146942017474187e-05, "loss": 3.0632, "step": 305500 }, { "epoch": 243.0, "eval_loss": 2.9134445190429688, "eval_runtime": 65.7666, "eval_samples_per_second": 6531.052, "eval_steps_per_second": 2.129, "step": 305937 }, { "epoch": 243.05, "learning_rate": 1.5138999205718825e-05, "loss": 3.0611, "step": 306000 }, { "epoch": 243.45, "learning_rate": 1.5131056393963463e-05, "loss": 3.0622, "step": 306500 }, { "epoch": 243.84, "learning_rate": 1.5123113582208103e-05, "loss": 3.0588, "step": 307000 }, { "epoch": 244.0, "eval_loss": 2.906571865081787, "eval_runtime": 67.644, "eval_samples_per_second": 6349.784, "eval_steps_per_second": 2.07, "step": 307196 }, { "epoch": 244.24, "learning_rate": 1.5115170770452741e-05, "loss": 3.062, "step": 307500 }, { "epoch": 244.64, "learning_rate": 1.5107227958697379e-05, "loss": 3.0556, "step": 308000 }, { "epoch": 245.0, "eval_loss": 2.911162853240967, "eval_runtime": 67.0284, "eval_samples_per_second": 6408.102, "eval_steps_per_second": 2.089, "step": 308455 }, { "epoch": 245.04, "learning_rate": 1.5099285146942017e-05, "loss": 3.0593, "step": 308500 }, { "epoch": 245.43, "learning_rate": 1.5091342335186659e-05, "loss": 3.0579, "step": 309000 }, { "epoch": 245.83, "learning_rate": 1.5083399523431297e-05, "loss": 3.0596, "step": 309500 }, { "epoch": 246.0, "eval_loss": 2.9101808071136475, "eval_runtime": 65.5592, "eval_samples_per_second": 6551.709, "eval_steps_per_second": 2.135, "step": 309714 }, { "epoch": 246.23, "learning_rate": 1.5075456711675935e-05, "loss": 3.0578, "step": 310000 }, { "epoch": 246.62, "learning_rate": 1.5067513899920573e-05, "loss": 3.0573, "step": 310500 }, { "epoch": 247.0, "eval_loss": 2.907625913619995, "eval_runtime": 64.7274, "eval_samples_per_second": 6635.912, "eval_steps_per_second": 2.163, "step": 310973 }, { "epoch": 247.02, "learning_rate": 1.5059571088165213e-05, "loss": 3.059, "step": 311000 }, { "epoch": 247.42, "learning_rate": 1.5051628276409851e-05, "loss": 3.0575, "step": 311500 }, { "epoch": 247.82, "learning_rate": 1.5043685464654489e-05, "loss": 3.0534, "step": 312000 }, { "epoch": 248.0, "eval_loss": 2.9042575359344482, "eval_runtime": 68.9607, "eval_samples_per_second": 6228.547, "eval_steps_per_second": 2.03, "step": 312232 }, { "epoch": 248.21, "learning_rate": 1.5035742652899127e-05, "loss": 3.0574, "step": 312500 }, { "epoch": 248.61, "learning_rate": 1.5027799841143767e-05, "loss": 3.0526, "step": 313000 }, { "epoch": 249.0, "eval_loss": 2.902890920639038, "eval_runtime": 69.7878, "eval_samples_per_second": 6154.73, "eval_steps_per_second": 2.006, "step": 313491 }, { "epoch": 249.01, "learning_rate": 1.5019857029388405e-05, "loss": 3.0552, "step": 313500 }, { "epoch": 249.4, "learning_rate": 1.5011914217633043e-05, "loss": 3.0516, "step": 314000 }, { "epoch": 249.8, "learning_rate": 1.5003971405877681e-05, "loss": 3.0536, "step": 314500 }, { "epoch": 250.0, "eval_loss": 2.9027199745178223, "eval_runtime": 66.3245, "eval_samples_per_second": 6476.11, "eval_steps_per_second": 2.111, "step": 314750 }, { "epoch": 250.2, "learning_rate": 1.4996028594122321e-05, "loss": 3.0508, "step": 315000 }, { "epoch": 250.6, "learning_rate": 1.498808578236696e-05, "loss": 3.0542, "step": 315500 }, { "epoch": 250.99, "learning_rate": 1.4980142970611597e-05, "loss": 3.0498, "step": 316000 }, { "epoch": 251.0, "eval_loss": 2.902456521987915, "eval_runtime": 67.7699, "eval_samples_per_second": 6337.989, "eval_steps_per_second": 2.066, "step": 316009 }, { "epoch": 251.39, "learning_rate": 1.4972200158856235e-05, "loss": 3.0509, "step": 316500 }, { "epoch": 251.79, "learning_rate": 1.4964257347100875e-05, "loss": 3.0525, "step": 317000 }, { "epoch": 252.0, "eval_loss": 2.9057700634002686, "eval_runtime": 68.4689, "eval_samples_per_second": 6273.29, "eval_steps_per_second": 2.045, "step": 317268 }, { "epoch": 252.18, "learning_rate": 1.4956314535345513e-05, "loss": 3.0525, "step": 317500 }, { "epoch": 252.58, "learning_rate": 1.4948371723590151e-05, "loss": 3.0489, "step": 318000 }, { "epoch": 252.98, "learning_rate": 1.494042891183479e-05, "loss": 3.0505, "step": 318500 }, { "epoch": 253.0, "eval_loss": 2.9030964374542236, "eval_runtime": 66.7285, "eval_samples_per_second": 6436.903, "eval_steps_per_second": 2.098, "step": 318527 }, { "epoch": 253.38, "learning_rate": 1.493248610007943e-05, "loss": 3.0494, "step": 319000 }, { "epoch": 253.77, "learning_rate": 1.4924543288324068e-05, "loss": 3.0489, "step": 319500 }, { "epoch": 254.0, "eval_loss": 2.8985490798950195, "eval_runtime": 64.5643, "eval_samples_per_second": 6652.671, "eval_steps_per_second": 2.168, "step": 319786 }, { "epoch": 254.17, "learning_rate": 1.4916600476568706e-05, "loss": 3.0494, "step": 320000 }, { "epoch": 254.57, "learning_rate": 1.4908657664813344e-05, "loss": 3.0459, "step": 320500 }, { "epoch": 254.96, "learning_rate": 1.4900714853057985e-05, "loss": 3.0444, "step": 321000 }, { "epoch": 255.0, "eval_loss": 2.8965048789978027, "eval_runtime": 66.3055, "eval_samples_per_second": 6477.974, "eval_steps_per_second": 2.111, "step": 321045 }, { "epoch": 255.36, "learning_rate": 1.4892772041302623e-05, "loss": 3.0463, "step": 321500 }, { "epoch": 255.76, "learning_rate": 1.4884829229547261e-05, "loss": 3.0447, "step": 322000 }, { "epoch": 256.0, "eval_loss": 2.8916826248168945, "eval_runtime": 64.7963, "eval_samples_per_second": 6628.851, "eval_steps_per_second": 2.161, "step": 322304 }, { "epoch": 256.16, "learning_rate": 1.48768864177919e-05, "loss": 3.0459, "step": 322500 }, { "epoch": 256.55, "learning_rate": 1.486894360603654e-05, "loss": 3.0467, "step": 323000 }, { "epoch": 256.95, "learning_rate": 1.4861000794281177e-05, "loss": 3.0408, "step": 323500 }, { "epoch": 257.0, "eval_loss": 2.892148017883301, "eval_runtime": 65.673, "eval_samples_per_second": 6540.364, "eval_steps_per_second": 2.132, "step": 323563 }, { "epoch": 257.35, "learning_rate": 1.4853057982525816e-05, "loss": 3.0442, "step": 324000 }, { "epoch": 257.74, "learning_rate": 1.4845115170770454e-05, "loss": 3.0442, "step": 324500 }, { "epoch": 258.0, "eval_loss": 2.8967716693878174, "eval_runtime": 69.1015, "eval_samples_per_second": 6215.854, "eval_steps_per_second": 2.026, "step": 324822 }, { "epoch": 258.14, "learning_rate": 1.4837172359015093e-05, "loss": 3.0407, "step": 325000 }, { "epoch": 258.54, "learning_rate": 1.4829229547259732e-05, "loss": 3.043, "step": 325500 }, { "epoch": 258.94, "learning_rate": 1.482128673550437e-05, "loss": 3.0424, "step": 326000 }, { "epoch": 259.0, "eval_loss": 2.8943562507629395, "eval_runtime": 64.984, "eval_samples_per_second": 6609.702, "eval_steps_per_second": 2.154, "step": 326081 }, { "epoch": 259.33, "learning_rate": 1.4813343923749008e-05, "loss": 3.0439, "step": 326500 }, { "epoch": 259.73, "learning_rate": 1.4805401111993648e-05, "loss": 3.0401, "step": 327000 }, { "epoch": 260.0, "eval_loss": 2.8892300128936768, "eval_runtime": 67.3898, "eval_samples_per_second": 6373.743, "eval_steps_per_second": 2.077, "step": 327340 }, { "epoch": 260.13, "learning_rate": 1.4797458300238286e-05, "loss": 3.0409, "step": 327500 }, { "epoch": 260.52, "learning_rate": 1.4789515488482924e-05, "loss": 3.039, "step": 328000 }, { "epoch": 260.92, "learning_rate": 1.4781572676727562e-05, "loss": 3.0377, "step": 328500 }, { "epoch": 261.0, "eval_loss": 2.8880300521850586, "eval_runtime": 68.4018, "eval_samples_per_second": 6279.44, "eval_steps_per_second": 2.047, "step": 328599 }, { "epoch": 261.32, "learning_rate": 1.4773629864972202e-05, "loss": 3.0378, "step": 329000 }, { "epoch": 261.72, "learning_rate": 1.476568705321684e-05, "loss": 3.0381, "step": 329500 }, { "epoch": 262.0, "eval_loss": 2.8854403495788574, "eval_runtime": 65.8924, "eval_samples_per_second": 6518.583, "eval_steps_per_second": 2.125, "step": 329858 }, { "epoch": 262.11, "learning_rate": 1.4757744241461478e-05, "loss": 3.0363, "step": 330000 }, { "epoch": 262.51, "learning_rate": 1.4749801429706116e-05, "loss": 3.0374, "step": 330500 }, { "epoch": 262.91, "learning_rate": 1.4741858617950754e-05, "loss": 3.036, "step": 331000 }, { "epoch": 263.0, "eval_loss": 2.885233163833618, "eval_runtime": 67.2156, "eval_samples_per_second": 6390.256, "eval_steps_per_second": 2.083, "step": 331117 }, { "epoch": 263.3, "learning_rate": 1.4733915806195394e-05, "loss": 3.0349, "step": 331500 }, { "epoch": 263.7, "learning_rate": 1.4725972994440032e-05, "loss": 3.0379, "step": 332000 }, { "epoch": 264.0, "eval_loss": 2.8814380168914795, "eval_runtime": 65.6307, "eval_samples_per_second": 6544.577, "eval_steps_per_second": 2.133, "step": 332376 }, { "epoch": 264.1, "learning_rate": 1.471803018268467e-05, "loss": 3.0376, "step": 332500 }, { "epoch": 264.5, "learning_rate": 1.4710087370929312e-05, "loss": 3.034, "step": 333000 }, { "epoch": 264.89, "learning_rate": 1.470214455917395e-05, "loss": 3.0366, "step": 333500 }, { "epoch": 265.0, "eval_loss": 2.8846473693847656, "eval_runtime": 66.4993, "eval_samples_per_second": 6459.092, "eval_steps_per_second": 2.105, "step": 333635 }, { "epoch": 265.29, "learning_rate": 1.4694201747418588e-05, "loss": 3.0325, "step": 334000 }, { "epoch": 265.69, "learning_rate": 1.4686258935663226e-05, "loss": 3.0338, "step": 334500 }, { "epoch": 266.0, "eval_loss": 2.877300262451172, "eval_runtime": 65.7285, "eval_samples_per_second": 6534.841, "eval_steps_per_second": 2.13, "step": 334894 }, { "epoch": 266.08, "learning_rate": 1.4678316123907866e-05, "loss": 3.0338, "step": 335000 }, { "epoch": 266.48, "learning_rate": 1.4670373312152504e-05, "loss": 3.0343, "step": 335500 }, { "epoch": 266.88, "learning_rate": 1.4662430500397142e-05, "loss": 3.0309, "step": 336000 }, { "epoch": 267.0, "eval_loss": 2.883594036102295, "eval_runtime": 67.4777, "eval_samples_per_second": 6365.432, "eval_steps_per_second": 2.075, "step": 336153 }, { "epoch": 267.28, "learning_rate": 1.465448768864178e-05, "loss": 3.0316, "step": 336500 }, { "epoch": 267.67, "learning_rate": 1.464654487688642e-05, "loss": 3.0275, "step": 337000 }, { "epoch": 268.0, "eval_loss": 2.876830577850342, "eval_runtime": 65.3196, "eval_samples_per_second": 6575.748, "eval_steps_per_second": 2.143, "step": 337412 }, { "epoch": 268.07, "learning_rate": 1.4638602065131058e-05, "loss": 3.0286, "step": 337500 }, { "epoch": 268.47, "learning_rate": 1.4630659253375696e-05, "loss": 3.0292, "step": 338000 }, { "epoch": 268.86, "learning_rate": 1.4622716441620334e-05, "loss": 3.0301, "step": 338500 }, { "epoch": 269.0, "eval_loss": 2.877840042114258, "eval_runtime": 66.0729, "eval_samples_per_second": 6500.771, "eval_steps_per_second": 2.119, "step": 338671 }, { "epoch": 269.26, "learning_rate": 1.4614773629864974e-05, "loss": 3.031, "step": 339000 }, { "epoch": 269.66, "learning_rate": 1.4606830818109612e-05, "loss": 3.0293, "step": 339500 }, { "epoch": 270.0, "eval_loss": 2.873568296432495, "eval_runtime": 64.9212, "eval_samples_per_second": 6616.098, "eval_steps_per_second": 2.156, "step": 339930 }, { "epoch": 270.06, "learning_rate": 1.459888800635425e-05, "loss": 3.0262, "step": 340000 }, { "epoch": 270.45, "learning_rate": 1.4590945194598888e-05, "loss": 3.0278, "step": 340500 }, { "epoch": 270.85, "learning_rate": 1.4583002382843527e-05, "loss": 3.0256, "step": 341000 }, { "epoch": 271.0, "eval_loss": 2.8720202445983887, "eval_runtime": 67.2011, "eval_samples_per_second": 6391.64, "eval_steps_per_second": 2.083, "step": 341189 }, { "epoch": 271.25, "learning_rate": 1.4575059571088166e-05, "loss": 3.0262, "step": 341500 }, { "epoch": 271.64, "learning_rate": 1.4567116759332804e-05, "loss": 3.0277, "step": 342000 }, { "epoch": 272.0, "eval_loss": 2.8822312355041504, "eval_runtime": 64.9726, "eval_samples_per_second": 6610.867, "eval_steps_per_second": 2.155, "step": 342448 }, { "epoch": 272.04, "learning_rate": 1.4559173947577443e-05, "loss": 3.0266, "step": 342500 }, { "epoch": 272.44, "learning_rate": 1.455123113582208e-05, "loss": 3.0266, "step": 343000 }, { "epoch": 272.84, "learning_rate": 1.454328832406672e-05, "loss": 3.0233, "step": 343500 }, { "epoch": 273.0, "eval_loss": 2.872910737991333, "eval_runtime": 66.4732, "eval_samples_per_second": 6461.622, "eval_steps_per_second": 2.106, "step": 343707 }, { "epoch": 273.23, "learning_rate": 1.4535345512311359e-05, "loss": 3.0241, "step": 344000 }, { "epoch": 273.63, "learning_rate": 1.4527402700555997e-05, "loss": 3.0262, "step": 344500 }, { "epoch": 274.0, "eval_loss": 2.868974208831787, "eval_runtime": 65.197, "eval_samples_per_second": 6588.114, "eval_steps_per_second": 2.147, "step": 344966 }, { "epoch": 274.03, "learning_rate": 1.4519459888800635e-05, "loss": 3.0238, "step": 345000 }, { "epoch": 274.42, "learning_rate": 1.4511517077045276e-05, "loss": 3.0261, "step": 345500 }, { "epoch": 274.82, "learning_rate": 1.4503574265289914e-05, "loss": 3.0236, "step": 346000 }, { "epoch": 275.0, "eval_loss": 2.8751070499420166, "eval_runtime": 65.8738, "eval_samples_per_second": 6520.42, "eval_steps_per_second": 2.125, "step": 346225 }, { "epoch": 275.22, "learning_rate": 1.4495631453534553e-05, "loss": 3.0215, "step": 346500 }, { "epoch": 275.62, "learning_rate": 1.4487688641779192e-05, "loss": 3.0234, "step": 347000 }, { "epoch": 276.0, "eval_loss": 2.872894763946533, "eval_runtime": 65.0967, "eval_samples_per_second": 6598.264, "eval_steps_per_second": 2.151, "step": 347484 }, { "epoch": 276.01, "learning_rate": 1.447974583002383e-05, "loss": 3.0216, "step": 347500 }, { "epoch": 276.41, "learning_rate": 1.4471803018268469e-05, "loss": 3.0209, "step": 348000 }, { "epoch": 276.81, "learning_rate": 1.4463860206513107e-05, "loss": 3.0218, "step": 348500 }, { "epoch": 277.0, "eval_loss": 2.8744547367095947, "eval_runtime": 67.0142, "eval_samples_per_second": 6409.464, "eval_steps_per_second": 2.089, "step": 348743 }, { "epoch": 277.2, "learning_rate": 1.4455917394757747e-05, "loss": 3.0217, "step": 349000 }, { "epoch": 277.6, "learning_rate": 1.4447974583002385e-05, "loss": 3.0183, "step": 349500 }, { "epoch": 278.0, "learning_rate": 1.4440031771247023e-05, "loss": 3.0197, "step": 350000 }, { "epoch": 278.0, "eval_loss": 2.8718974590301514, "eval_runtime": 65.0525, "eval_samples_per_second": 6602.744, "eval_steps_per_second": 2.152, "step": 350002 }, { "epoch": 278.4, "learning_rate": 1.443208895949166e-05, "loss": 3.0175, "step": 350500 }, { "epoch": 278.79, "learning_rate": 1.44241461477363e-05, "loss": 3.0205, "step": 351000 }, { "epoch": 279.0, "eval_loss": 2.8666226863861084, "eval_runtime": 65.6798, "eval_samples_per_second": 6539.683, "eval_steps_per_second": 2.132, "step": 351261 }, { "epoch": 279.19, "learning_rate": 1.4416203335980939e-05, "loss": 3.0188, "step": 351500 }, { "epoch": 279.59, "learning_rate": 1.4408260524225577e-05, "loss": 3.0176, "step": 352000 }, { "epoch": 279.98, "learning_rate": 1.4400317712470215e-05, "loss": 3.0196, "step": 352500 }, { "epoch": 280.0, "eval_loss": 2.864793062210083, "eval_runtime": 67.8936, "eval_samples_per_second": 6326.448, "eval_steps_per_second": 2.062, "step": 352520 }, { "epoch": 280.38, "learning_rate": 1.4392374900714853e-05, "loss": 3.0164, "step": 353000 }, { "epoch": 280.78, "learning_rate": 1.4384432088959493e-05, "loss": 3.0169, "step": 353500 }, { "epoch": 281.0, "eval_loss": 2.8629202842712402, "eval_runtime": 66.7156, "eval_samples_per_second": 6438.15, "eval_steps_per_second": 2.098, "step": 353779 }, { "epoch": 281.18, "learning_rate": 1.4376489277204131e-05, "loss": 3.0144, "step": 354000 }, { "epoch": 281.57, "learning_rate": 1.4368546465448769e-05, "loss": 3.014, "step": 354500 }, { "epoch": 281.97, "learning_rate": 1.4360603653693407e-05, "loss": 3.0171, "step": 355000 }, { "epoch": 282.0, "eval_loss": 2.8675363063812256, "eval_runtime": 65.3023, "eval_samples_per_second": 6577.489, "eval_steps_per_second": 2.144, "step": 355038 }, { "epoch": 282.37, "learning_rate": 1.4352660841938047e-05, "loss": 3.0144, "step": 355500 }, { "epoch": 282.76, "learning_rate": 1.4344718030182685e-05, "loss": 3.0148, "step": 356000 }, { "epoch": 283.0, "eval_loss": 2.860086679458618, "eval_runtime": 65.4278, "eval_samples_per_second": 6564.869, "eval_steps_per_second": 2.14, "step": 356297 }, { "epoch": 283.16, "learning_rate": 1.4336775218427323e-05, "loss": 3.0155, "step": 356500 }, { "epoch": 283.56, "learning_rate": 1.4328832406671961e-05, "loss": 3.0141, "step": 357000 }, { "epoch": 283.96, "learning_rate": 1.4320889594916603e-05, "loss": 3.0155, "step": 357500 }, { "epoch": 284.0, "eval_loss": 2.861309051513672, "eval_runtime": 67.3753, "eval_samples_per_second": 6375.112, "eval_steps_per_second": 2.078, "step": 357556 }, { "epoch": 284.35, "learning_rate": 1.4312946783161241e-05, "loss": 3.0132, "step": 358000 }, { "epoch": 284.75, "learning_rate": 1.4305003971405879e-05, "loss": 3.0126, "step": 358500 }, { "epoch": 285.0, "eval_loss": 2.8593084812164307, "eval_runtime": 65.1331, "eval_samples_per_second": 6594.57, "eval_steps_per_second": 2.149, "step": 358815 }, { "epoch": 285.15, "learning_rate": 1.4297061159650519e-05, "loss": 3.0123, "step": 359000 }, { "epoch": 285.54, "learning_rate": 1.4289118347895157e-05, "loss": 3.0097, "step": 359500 }, { "epoch": 285.94, "learning_rate": 1.4281175536139795e-05, "loss": 3.0138, "step": 360000 }, { "epoch": 286.0, "eval_loss": 2.8625035285949707, "eval_runtime": 65.4545, "eval_samples_per_second": 6562.191, "eval_steps_per_second": 2.139, "step": 360074 }, { "epoch": 286.34, "learning_rate": 1.4273232724384433e-05, "loss": 3.0112, "step": 360500 }, { "epoch": 286.74, "learning_rate": 1.4265289912629073e-05, "loss": 3.0088, "step": 361000 }, { "epoch": 287.0, "eval_loss": 2.8593151569366455, "eval_runtime": 65.8771, "eval_samples_per_second": 6520.091, "eval_steps_per_second": 2.125, "step": 361333 }, { "epoch": 287.13, "learning_rate": 1.4257347100873711e-05, "loss": 3.01, "step": 361500 }, { "epoch": 287.53, "learning_rate": 1.424940428911835e-05, "loss": 3.0091, "step": 362000 }, { "epoch": 287.93, "learning_rate": 1.4241461477362987e-05, "loss": 3.0088, "step": 362500 }, { "epoch": 288.0, "eval_loss": 2.862494945526123, "eval_runtime": 65.5606, "eval_samples_per_second": 6551.575, "eval_steps_per_second": 2.135, "step": 362592 }, { "epoch": 288.32, "learning_rate": 1.4233518665607625e-05, "loss": 3.0088, "step": 363000 }, { "epoch": 288.72, "learning_rate": 1.4225575853852265e-05, "loss": 3.0079, "step": 363500 }, { "epoch": 289.0, "eval_loss": 2.855443239212036, "eval_runtime": 68.0498, "eval_samples_per_second": 6311.92, "eval_steps_per_second": 2.057, "step": 363851 }, { "epoch": 289.12, "learning_rate": 1.4217633042096903e-05, "loss": 3.0064, "step": 364000 }, { "epoch": 289.52, "learning_rate": 1.4209690230341541e-05, "loss": 3.0095, "step": 364500 }, { "epoch": 289.91, "learning_rate": 1.420174741858618e-05, "loss": 3.0068, "step": 365000 }, { "epoch": 290.0, "eval_loss": 2.859919786453247, "eval_runtime": 69.1823, "eval_samples_per_second": 6208.6, "eval_steps_per_second": 2.024, "step": 365110 }, { "epoch": 290.31, "learning_rate": 1.419380460683082e-05, "loss": 3.0075, "step": 365500 }, { "epoch": 290.71, "learning_rate": 1.4185861795075458e-05, "loss": 3.004, "step": 366000 }, { "epoch": 291.0, "eval_loss": 2.8571276664733887, "eval_runtime": 66.1532, "eval_samples_per_second": 6492.881, "eval_steps_per_second": 2.116, "step": 366369 }, { "epoch": 291.1, "learning_rate": 1.4177918983320096e-05, "loss": 3.0069, "step": 366500 }, { "epoch": 291.5, "learning_rate": 1.4169976171564734e-05, "loss": 3.0059, "step": 367000 }, { "epoch": 291.9, "learning_rate": 1.4162033359809374e-05, "loss": 3.0064, "step": 367500 }, { "epoch": 292.0, "eval_loss": 2.8527328968048096, "eval_runtime": 65.8317, "eval_samples_per_second": 6524.591, "eval_steps_per_second": 2.127, "step": 367628 }, { "epoch": 292.3, "learning_rate": 1.4154090548054012e-05, "loss": 3.0065, "step": 368000 }, { "epoch": 292.69, "learning_rate": 1.414614773629865e-05, "loss": 3.0016, "step": 368500 }, { "epoch": 293.0, "eval_loss": 2.8553824424743652, "eval_runtime": 67.1849, "eval_samples_per_second": 6393.178, "eval_steps_per_second": 2.084, "step": 368887 }, { "epoch": 293.09, "learning_rate": 1.4138204924543288e-05, "loss": 3.005, "step": 369000 }, { "epoch": 293.49, "learning_rate": 1.413026211278793e-05, "loss": 3.002, "step": 369500 }, { "epoch": 293.88, "learning_rate": 1.4122319301032567e-05, "loss": 3.0046, "step": 370000 }, { "epoch": 294.0, "eval_loss": 2.8564395904541016, "eval_runtime": 66.1787, "eval_samples_per_second": 6490.383, "eval_steps_per_second": 2.115, "step": 370146 }, { "epoch": 294.28, "learning_rate": 1.4114376489277206e-05, "loss": 3.0027, "step": 370500 }, { "epoch": 294.68, "learning_rate": 1.4106433677521845e-05, "loss": 3.0023, "step": 371000 }, { "epoch": 295.0, "eval_loss": 2.8483004570007324, "eval_runtime": 67.967, "eval_samples_per_second": 6319.613, "eval_steps_per_second": 2.06, "step": 371405 }, { "epoch": 295.08, "learning_rate": 1.4098490865766483e-05, "loss": 3.0021, "step": 371500 }, { "epoch": 295.47, "learning_rate": 1.4090548054011122e-05, "loss": 2.9994, "step": 372000 }, { "epoch": 295.87, "learning_rate": 1.408260524225576e-05, "loss": 3.0022, "step": 372500 }, { "epoch": 296.0, "eval_loss": 2.8509414196014404, "eval_runtime": 65.5083, "eval_samples_per_second": 6556.803, "eval_steps_per_second": 2.137, "step": 372664 }, { "epoch": 296.27, "learning_rate": 1.40746624305004e-05, "loss": 2.9997, "step": 373000 }, { "epoch": 296.66, "learning_rate": 1.4066719618745038e-05, "loss": 3.0024, "step": 373500 }, { "epoch": 297.0, "eval_loss": 2.8466475009918213, "eval_runtime": 68.6001, "eval_samples_per_second": 6261.287, "eval_steps_per_second": 2.041, "step": 373923 }, { "epoch": 297.06, "learning_rate": 1.4058776806989676e-05, "loss": 3.001, "step": 374000 }, { "epoch": 297.46, "learning_rate": 1.4050833995234314e-05, "loss": 3.002, "step": 374500 }, { "epoch": 297.86, "learning_rate": 1.4042891183478952e-05, "loss": 3.0012, "step": 375000 }, { "epoch": 298.0, "eval_loss": 2.847892999649048, "eval_runtime": 66.6819, "eval_samples_per_second": 6441.404, "eval_steps_per_second": 2.1, "step": 375182 }, { "epoch": 298.25, "learning_rate": 1.4034948371723592e-05, "loss": 2.9997, "step": 375500 }, { "epoch": 298.65, "learning_rate": 1.402700555996823e-05, "loss": 2.9983, "step": 376000 }, { "epoch": 299.0, "eval_loss": 2.8509113788604736, "eval_runtime": 65.9832, "eval_samples_per_second": 6509.607, "eval_steps_per_second": 2.122, "step": 376441 }, { "epoch": 299.05, "learning_rate": 1.4019062748212868e-05, "loss": 2.9997, "step": 376500 }, { "epoch": 299.44, "learning_rate": 1.4011119936457506e-05, "loss": 2.9969, "step": 377000 }, { "epoch": 299.84, "learning_rate": 1.4003177124702146e-05, "loss": 2.997, "step": 377500 }, { "epoch": 300.0, "eval_loss": 2.8491482734680176, "eval_runtime": 66.0477, "eval_samples_per_second": 6503.257, "eval_steps_per_second": 2.12, "step": 377700 }, { "epoch": 300.24, "learning_rate": 1.3995234312946784e-05, "loss": 2.9991, "step": 378000 }, { "epoch": 300.64, "learning_rate": 1.3987291501191422e-05, "loss": 2.9979, "step": 378500 }, { "epoch": 301.0, "eval_loss": 2.8422210216522217, "eval_runtime": 67.5065, "eval_samples_per_second": 6362.72, "eval_steps_per_second": 2.074, "step": 378959 }, { "epoch": 301.03, "learning_rate": 1.397934868943606e-05, "loss": 2.9952, "step": 379000 }, { "epoch": 301.43, "learning_rate": 1.39714058776807e-05, "loss": 2.9966, "step": 379500 }, { "epoch": 301.83, "learning_rate": 1.3963463065925338e-05, "loss": 2.9945, "step": 380000 }, { "epoch": 302.0, "eval_loss": 2.8456003665924072, "eval_runtime": 67.179, "eval_samples_per_second": 6393.742, "eval_steps_per_second": 2.084, "step": 380218 }, { "epoch": 302.22, "learning_rate": 1.3955520254169976e-05, "loss": 2.9924, "step": 380500 }, { "epoch": 302.62, "learning_rate": 1.3947577442414614e-05, "loss": 2.9968, "step": 381000 }, { "epoch": 303.0, "eval_loss": 2.8428964614868164, "eval_runtime": 65.7309, "eval_samples_per_second": 6534.598, "eval_steps_per_second": 2.13, "step": 381477 }, { "epoch": 303.02, "learning_rate": 1.3939634630659256e-05, "loss": 2.9951, "step": 381500 }, { "epoch": 303.42, "learning_rate": 1.3931691818903894e-05, "loss": 2.9925, "step": 382000 }, { "epoch": 303.81, "learning_rate": 1.3923749007148532e-05, "loss": 2.994, "step": 382500 }, { "epoch": 304.0, "eval_loss": 2.8452248573303223, "eval_runtime": 66.0182, "eval_samples_per_second": 6506.158, "eval_steps_per_second": 2.121, "step": 382736 }, { "epoch": 304.21, "learning_rate": 1.3915806195393172e-05, "loss": 2.9933, "step": 383000 }, { "epoch": 304.61, "learning_rate": 1.390786338363781e-05, "loss": 2.9924, "step": 383500 }, { "epoch": 305.0, "eval_loss": 2.838834285736084, "eval_runtime": 65.7519, "eval_samples_per_second": 6532.508, "eval_steps_per_second": 2.129, "step": 383995 }, { "epoch": 305.0, "learning_rate": 1.3899920571882448e-05, "loss": 2.9936, "step": 384000 }, { "epoch": 305.4, "learning_rate": 1.3891977760127086e-05, "loss": 2.9921, "step": 384500 }, { "epoch": 305.8, "learning_rate": 1.3884034948371724e-05, "loss": 2.9869, "step": 385000 }, { "epoch": 306.0, "eval_loss": 2.837144613265991, "eval_runtime": 65.5648, "eval_samples_per_second": 6551.156, "eval_steps_per_second": 2.135, "step": 385254 }, { "epoch": 306.2, "learning_rate": 1.3876092136616364e-05, "loss": 2.9911, "step": 385500 }, { "epoch": 306.59, "learning_rate": 1.3868149324861002e-05, "loss": 2.9884, "step": 386000 }, { "epoch": 306.99, "learning_rate": 1.386020651310564e-05, "loss": 2.9893, "step": 386500 }, { "epoch": 307.0, "eval_loss": 2.8374884128570557, "eval_runtime": 67.0524, "eval_samples_per_second": 6405.814, "eval_steps_per_second": 2.088, "step": 386513 }, { "epoch": 307.39, "learning_rate": 1.3852263701350278e-05, "loss": 2.9915, "step": 387000 }, { "epoch": 307.78, "learning_rate": 1.3844320889594918e-05, "loss": 2.9889, "step": 387500 }, { "epoch": 308.0, "eval_loss": 2.840679883956909, "eval_runtime": 66.898, "eval_samples_per_second": 6420.598, "eval_steps_per_second": 2.093, "step": 387772 }, { "epoch": 308.18, "learning_rate": 1.3836378077839556e-05, "loss": 2.9892, "step": 388000 }, { "epoch": 308.58, "learning_rate": 1.3828435266084194e-05, "loss": 2.9899, "step": 388500 }, { "epoch": 308.98, "learning_rate": 1.3820492454328833e-05, "loss": 2.9887, "step": 389000 }, { "epoch": 309.0, "eval_loss": 2.8377134799957275, "eval_runtime": 66.6879, "eval_samples_per_second": 6440.825, "eval_steps_per_second": 2.099, "step": 389031 }, { "epoch": 309.37, "learning_rate": 1.3812549642573472e-05, "loss": 2.989, "step": 389500 }, { "epoch": 309.77, "learning_rate": 1.380460683081811e-05, "loss": 2.9872, "step": 390000 }, { "epoch": 310.0, "eval_loss": 2.8379697799682617, "eval_runtime": 66.0647, "eval_samples_per_second": 6501.585, "eval_steps_per_second": 2.119, "step": 390290 }, { "epoch": 310.17, "learning_rate": 1.3796664019062749e-05, "loss": 2.9861, "step": 390500 }, { "epoch": 310.56, "learning_rate": 1.3788721207307387e-05, "loss": 2.9864, "step": 391000 }, { "epoch": 310.96, "learning_rate": 1.3780778395552027e-05, "loss": 2.9859, "step": 391500 }, { "epoch": 311.0, "eval_loss": 2.836700439453125, "eval_runtime": 65.9647, "eval_samples_per_second": 6511.434, "eval_steps_per_second": 2.122, "step": 391549 }, { "epoch": 311.36, "learning_rate": 1.3772835583796665e-05, "loss": 2.9854, "step": 392000 }, { "epoch": 311.76, "learning_rate": 1.3764892772041303e-05, "loss": 2.9868, "step": 392500 }, { "epoch": 312.0, "eval_loss": 2.8314461708068848, "eval_runtime": 65.4076, "eval_samples_per_second": 6566.9, "eval_steps_per_second": 2.14, "step": 392808 }, { "epoch": 312.15, "learning_rate": 1.3756949960285941e-05, "loss": 2.983, "step": 393000 }, { "epoch": 312.55, "learning_rate": 1.3749007148530582e-05, "loss": 2.9882, "step": 393500 }, { "epoch": 312.95, "learning_rate": 1.374106433677522e-05, "loss": 2.981, "step": 394000 }, { "epoch": 313.0, "eval_loss": 2.8354291915893555, "eval_runtime": 66.9271, "eval_samples_per_second": 6417.801, "eval_steps_per_second": 2.092, "step": 394067 }, { "epoch": 313.34, "learning_rate": 1.3733121525019859e-05, "loss": 2.9836, "step": 394500 }, { "epoch": 313.74, "learning_rate": 1.3725178713264498e-05, "loss": 2.9835, "step": 395000 }, { "epoch": 314.0, "eval_loss": 2.833383321762085, "eval_runtime": 65.8481, "eval_samples_per_second": 6522.972, "eval_steps_per_second": 2.126, "step": 395326 }, { "epoch": 314.14, "learning_rate": 1.3717235901509137e-05, "loss": 2.9806, "step": 395500 }, { "epoch": 314.54, "learning_rate": 1.3709293089753775e-05, "loss": 2.9856, "step": 396000 }, { "epoch": 314.93, "learning_rate": 1.3701350277998413e-05, "loss": 2.983, "step": 396500 }, { "epoch": 315.0, "eval_loss": 2.836735725402832, "eval_runtime": 68.1233, "eval_samples_per_second": 6305.114, "eval_steps_per_second": 2.055, "step": 396585 }, { "epoch": 315.33, "learning_rate": 1.369340746624305e-05, "loss": 2.9821, "step": 397000 }, { "epoch": 315.73, "learning_rate": 1.368546465448769e-05, "loss": 2.9823, "step": 397500 }, { "epoch": 316.0, "eval_loss": 2.831509590148926, "eval_runtime": 67.1985, "eval_samples_per_second": 6391.886, "eval_steps_per_second": 2.083, "step": 397844 }, { "epoch": 316.12, "learning_rate": 1.3677521842732329e-05, "loss": 2.9826, "step": 398000 }, { "epoch": 316.52, "learning_rate": 1.3669579030976967e-05, "loss": 2.9833, "step": 398500 }, { "epoch": 316.92, "learning_rate": 1.3661636219221605e-05, "loss": 2.9801, "step": 399000 }, { "epoch": 317.0, "eval_loss": 2.831411838531494, "eval_runtime": 66.9734, "eval_samples_per_second": 6413.367, "eval_steps_per_second": 2.09, "step": 399103 }, { "epoch": 317.32, "learning_rate": 1.3653693407466245e-05, "loss": 2.9821, "step": 399500 }, { "epoch": 317.71, "learning_rate": 1.3645750595710883e-05, "loss": 2.9785, "step": 400000 }, { "epoch": 318.0, "eval_loss": 2.83496356010437, "eval_runtime": 66.5123, "eval_samples_per_second": 6457.827, "eval_steps_per_second": 2.105, "step": 400362 }, { "epoch": 318.11, "learning_rate": 1.3637807783955521e-05, "loss": 2.9787, "step": 400500 }, { "epoch": 318.51, "learning_rate": 1.3629864972200159e-05, "loss": 2.9809, "step": 401000 }, { "epoch": 318.9, "learning_rate": 1.3621922160444799e-05, "loss": 2.9791, "step": 401500 }, { "epoch": 319.0, "eval_loss": 2.829274892807007, "eval_runtime": 66.1718, "eval_samples_per_second": 6491.06, "eval_steps_per_second": 2.116, "step": 401621 }, { "epoch": 319.3, "learning_rate": 1.3613979348689437e-05, "loss": 2.9808, "step": 402000 }, { "epoch": 319.7, "learning_rate": 1.3606036536934075e-05, "loss": 2.9814, "step": 402500 }, { "epoch": 320.0, "eval_loss": 2.836120128631592, "eval_runtime": 65.1864, "eval_samples_per_second": 6589.178, "eval_steps_per_second": 2.148, "step": 402880 }, { "epoch": 320.1, "learning_rate": 1.3598093725178713e-05, "loss": 2.9769, "step": 403000 }, { "epoch": 320.49, "learning_rate": 1.3590150913423353e-05, "loss": 2.9781, "step": 403500 }, { "epoch": 320.89, "learning_rate": 1.3582208101667991e-05, "loss": 2.9812, "step": 404000 }, { "epoch": 321.0, "eval_loss": 2.8317503929138184, "eval_runtime": 66.6707, "eval_samples_per_second": 6442.488, "eval_steps_per_second": 2.1, "step": 404139 }, { "epoch": 321.29, "learning_rate": 1.357426528991263e-05, "loss": 2.977, "step": 404500 }, { "epoch": 321.68, "learning_rate": 1.3566322478157267e-05, "loss": 2.9781, "step": 405000 }, { "epoch": 322.0, "eval_loss": 2.8260960578918457, "eval_runtime": 67.3223, "eval_samples_per_second": 6380.129, "eval_steps_per_second": 2.08, "step": 405398 }, { "epoch": 322.08, "learning_rate": 1.3558379666401905e-05, "loss": 2.975, "step": 405500 }, { "epoch": 322.48, "learning_rate": 1.3550436854646547e-05, "loss": 2.9762, "step": 406000 }, { "epoch": 322.88, "learning_rate": 1.3542494042891185e-05, "loss": 2.9743, "step": 406500 }, { "epoch": 323.0, "eval_loss": 2.825716733932495, "eval_runtime": 65.6952, "eval_samples_per_second": 6538.149, "eval_steps_per_second": 2.131, "step": 406657 }, { "epoch": 323.27, "learning_rate": 1.3534551231135823e-05, "loss": 2.9762, "step": 407000 }, { "epoch": 323.67, "learning_rate": 1.3526608419380463e-05, "loss": 2.9731, "step": 407500 }, { "epoch": 324.0, "eval_loss": 2.8234710693359375, "eval_runtime": 66.5794, "eval_samples_per_second": 6451.315, "eval_steps_per_second": 2.103, "step": 407916 }, { "epoch": 324.07, "learning_rate": 1.3518665607625101e-05, "loss": 2.9765, "step": 408000 }, { "epoch": 324.46, "learning_rate": 1.351072279586974e-05, "loss": 2.9738, "step": 408500 }, { "epoch": 324.86, "learning_rate": 1.3502779984114377e-05, "loss": 2.9773, "step": 409000 }, { "epoch": 325.0, "eval_loss": 2.8231420516967773, "eval_runtime": 66.0838, "eval_samples_per_second": 6499.702, "eval_steps_per_second": 2.119, "step": 409175 }, { "epoch": 325.26, "learning_rate": 1.3494837172359017e-05, "loss": 2.9755, "step": 409500 }, { "epoch": 325.66, "learning_rate": 1.3486894360603655e-05, "loss": 2.9731, "step": 410000 }, { "epoch": 326.0, "eval_loss": 2.8258159160614014, "eval_runtime": 66.9989, "eval_samples_per_second": 6410.93, "eval_steps_per_second": 2.09, "step": 410434 }, { "epoch": 326.05, "learning_rate": 1.3478951548848293e-05, "loss": 2.9757, "step": 410500 }, { "epoch": 326.45, "learning_rate": 1.3471008737092931e-05, "loss": 2.973, "step": 411000 }, { "epoch": 326.85, "learning_rate": 1.3463065925337571e-05, "loss": 2.9727, "step": 411500 }, { "epoch": 327.0, "eval_loss": 2.823537826538086, "eval_runtime": 66.1523, "eval_samples_per_second": 6492.976, "eval_steps_per_second": 2.116, "step": 411693 }, { "epoch": 327.24, "learning_rate": 1.345512311358221e-05, "loss": 2.9715, "step": 412000 }, { "epoch": 327.64, "learning_rate": 1.3447180301826848e-05, "loss": 2.9715, "step": 412500 }, { "epoch": 328.0, "eval_loss": 2.8172554969787598, "eval_runtime": 68.8166, "eval_samples_per_second": 6241.589, "eval_steps_per_second": 2.034, "step": 412952 }, { "epoch": 328.04, "learning_rate": 1.3439237490071486e-05, "loss": 2.975, "step": 413000 }, { "epoch": 328.44, "learning_rate": 1.3431294678316125e-05, "loss": 2.9725, "step": 413500 }, { "epoch": 328.83, "learning_rate": 1.3423351866560764e-05, "loss": 2.9753, "step": 414000 }, { "epoch": 329.0, "eval_loss": 2.821531295776367, "eval_runtime": 67.321, "eval_samples_per_second": 6380.253, "eval_steps_per_second": 2.08, "step": 414211 }, { "epoch": 329.23, "learning_rate": 1.3415409054805402e-05, "loss": 2.9687, "step": 414500 }, { "epoch": 329.63, "learning_rate": 1.340746624305004e-05, "loss": 2.9697, "step": 415000 }, { "epoch": 330.0, "eval_loss": 2.8160367012023926, "eval_runtime": 65.0767, "eval_samples_per_second": 6600.286, "eval_steps_per_second": 2.151, "step": 415470 }, { "epoch": 330.02, "learning_rate": 1.3399523431294678e-05, "loss": 2.9693, "step": 415500 }, { "epoch": 330.42, "learning_rate": 1.3391580619539318e-05, "loss": 2.9698, "step": 416000 }, { "epoch": 330.82, "learning_rate": 1.3383637807783956e-05, "loss": 2.9704, "step": 416500 }, { "epoch": 331.0, "eval_loss": 2.8178412914276123, "eval_runtime": 67.7414, "eval_samples_per_second": 6340.655, "eval_steps_per_second": 2.067, "step": 416729 }, { "epoch": 331.22, "learning_rate": 1.3375694996028594e-05, "loss": 2.9684, "step": 417000 }, { "epoch": 331.61, "learning_rate": 1.3367752184273232e-05, "loss": 2.9711, "step": 417500 }, { "epoch": 332.0, "eval_loss": 2.819356918334961, "eval_runtime": 67.1771, "eval_samples_per_second": 6393.917, "eval_steps_per_second": 2.084, "step": 417988 }, { "epoch": 332.01, "learning_rate": 1.3359809372517873e-05, "loss": 2.9703, "step": 418000 }, { "epoch": 332.41, "learning_rate": 1.3351866560762512e-05, "loss": 2.969, "step": 418500 }, { "epoch": 332.8, "learning_rate": 1.334392374900715e-05, "loss": 2.9671, "step": 419000 }, { "epoch": 333.0, "eval_loss": 2.8221123218536377, "eval_runtime": 66.4115, "eval_samples_per_second": 6467.627, "eval_steps_per_second": 2.108, "step": 419247 }, { "epoch": 333.2, "learning_rate": 1.333598093725179e-05, "loss": 2.9671, "step": 419500 }, { "epoch": 333.6, "learning_rate": 1.3328038125496428e-05, "loss": 2.9672, "step": 420000 }, { "epoch": 334.0, "learning_rate": 1.3320095313741066e-05, "loss": 2.9666, "step": 420500 }, { "epoch": 334.0, "eval_loss": 2.8178720474243164, "eval_runtime": 67.0621, "eval_samples_per_second": 6404.883, "eval_steps_per_second": 2.088, "step": 420506 }, { "epoch": 334.39, "learning_rate": 1.3312152501985704e-05, "loss": 2.9657, "step": 421000 }, { "epoch": 334.79, "learning_rate": 1.3304209690230344e-05, "loss": 2.9673, "step": 421500 }, { "epoch": 335.0, "eval_loss": 2.8165531158447266, "eval_runtime": 66.3626, "eval_samples_per_second": 6472.392, "eval_steps_per_second": 2.11, "step": 421765 }, { "epoch": 335.19, "learning_rate": 1.3296266878474982e-05, "loss": 2.9667, "step": 422000 }, { "epoch": 335.58, "learning_rate": 1.328832406671962e-05, "loss": 2.9685, "step": 422500 }, { "epoch": 335.98, "learning_rate": 1.3280381254964258e-05, "loss": 2.9656, "step": 423000 }, { "epoch": 336.0, "eval_loss": 2.8189289569854736, "eval_runtime": 65.7, "eval_samples_per_second": 6537.667, "eval_steps_per_second": 2.131, "step": 423024 }, { "epoch": 336.38, "learning_rate": 1.3272438443208898e-05, "loss": 2.9647, "step": 423500 }, { "epoch": 336.78, "learning_rate": 1.3264495631453536e-05, "loss": 2.9659, "step": 424000 }, { "epoch": 337.0, "eval_loss": 2.816883087158203, "eval_runtime": 68.8949, "eval_samples_per_second": 6234.5, "eval_steps_per_second": 2.032, "step": 424283 }, { "epoch": 337.17, "learning_rate": 1.3256552819698174e-05, "loss": 2.9664, "step": 424500 }, { "epoch": 337.57, "learning_rate": 1.3248610007942812e-05, "loss": 2.9639, "step": 425000 }, { "epoch": 337.97, "learning_rate": 1.3240667196187452e-05, "loss": 2.9639, "step": 425500 }, { "epoch": 338.0, "eval_loss": 2.812180995941162, "eval_runtime": 66.9003, "eval_samples_per_second": 6420.377, "eval_steps_per_second": 2.093, "step": 425542 }, { "epoch": 338.36, "learning_rate": 1.323272438443209e-05, "loss": 2.9664, "step": 426000 }, { "epoch": 338.76, "learning_rate": 1.3224781572676728e-05, "loss": 2.9631, "step": 426500 }, { "epoch": 339.0, "eval_loss": 2.8168528079986572, "eval_runtime": 65.4625, "eval_samples_per_second": 6561.389, "eval_steps_per_second": 2.139, "step": 426801 }, { "epoch": 339.16, "learning_rate": 1.3216838760921366e-05, "loss": 2.9637, "step": 427000 }, { "epoch": 339.56, "learning_rate": 1.3208895949166004e-05, "loss": 2.9636, "step": 427500 }, { "epoch": 339.95, "learning_rate": 1.3200953137410644e-05, "loss": 2.9633, "step": 428000 }, { "epoch": 340.0, "eval_loss": 2.811854839324951, "eval_runtime": 68.3645, "eval_samples_per_second": 6282.867, "eval_steps_per_second": 2.048, "step": 428060 }, { "epoch": 340.35, "learning_rate": 1.3193010325655282e-05, "loss": 2.9604, "step": 428500 }, { "epoch": 340.75, "learning_rate": 1.318506751389992e-05, "loss": 2.9606, "step": 429000 }, { "epoch": 341.0, "eval_loss": 2.813464641571045, "eval_runtime": 68.5621, "eval_samples_per_second": 6264.756, "eval_steps_per_second": 2.042, "step": 429319 }, { "epoch": 341.14, "learning_rate": 1.3177124702144558e-05, "loss": 2.9611, "step": 429500 }, { "epoch": 341.54, "learning_rate": 1.31691818903892e-05, "loss": 2.9623, "step": 430000 }, { "epoch": 341.94, "learning_rate": 1.3161239078633838e-05, "loss": 2.9589, "step": 430500 }, { "epoch": 342.0, "eval_loss": 2.810549736022949, "eval_runtime": 66.7741, "eval_samples_per_second": 6432.511, "eval_steps_per_second": 2.097, "step": 430578 }, { "epoch": 342.34, "learning_rate": 1.3153296266878476e-05, "loss": 2.9619, "step": 431000 }, { "epoch": 342.73, "learning_rate": 1.3145353455123116e-05, "loss": 2.9586, "step": 431500 }, { "epoch": 343.0, "eval_loss": 2.8072454929351807, "eval_runtime": 65.9094, "eval_samples_per_second": 6516.896, "eval_steps_per_second": 2.124, "step": 431837 }, { "epoch": 343.13, "learning_rate": 1.3137410643367754e-05, "loss": 2.9589, "step": 432000 }, { "epoch": 343.53, "learning_rate": 1.3129467831612392e-05, "loss": 2.9603, "step": 432500 }, { "epoch": 343.92, "learning_rate": 1.312152501985703e-05, "loss": 2.961, "step": 433000 }, { "epoch": 344.0, "eval_loss": 2.808232545852661, "eval_runtime": 69.7601, "eval_samples_per_second": 6157.169, "eval_steps_per_second": 2.007, "step": 433096 }, { "epoch": 344.32, "learning_rate": 1.311358220810167e-05, "loss": 2.9579, "step": 433500 }, { "epoch": 344.72, "learning_rate": 1.3105639396346308e-05, "loss": 2.9598, "step": 434000 }, { "epoch": 345.0, "eval_loss": 2.810576915740967, "eval_runtime": 66.9967, "eval_samples_per_second": 6411.132, "eval_steps_per_second": 2.09, "step": 434355 }, { "epoch": 345.12, "learning_rate": 1.3097696584590946e-05, "loss": 2.9589, "step": 434500 }, { "epoch": 345.51, "learning_rate": 1.3089753772835584e-05, "loss": 2.9596, "step": 435000 }, { "epoch": 345.91, "learning_rate": 1.3081810961080224e-05, "loss": 2.9575, "step": 435500 }, { "epoch": 346.0, "eval_loss": 2.8040266036987305, "eval_runtime": 65.9275, "eval_samples_per_second": 6515.115, "eval_steps_per_second": 2.124, "step": 435614 }, { "epoch": 346.31, "learning_rate": 1.3073868149324862e-05, "loss": 2.9598, "step": 436000 }, { "epoch": 346.7, "learning_rate": 1.30659253375695e-05, "loss": 2.9555, "step": 436500 }, { "epoch": 347.0, "eval_loss": 2.8091964721679688, "eval_runtime": 65.7052, "eval_samples_per_second": 6537.159, "eval_steps_per_second": 2.131, "step": 436873 }, { "epoch": 347.1, "learning_rate": 1.3057982525814139e-05, "loss": 2.9545, "step": 437000 }, { "epoch": 347.5, "learning_rate": 1.3050039714058777e-05, "loss": 2.9549, "step": 437500 }, { "epoch": 347.9, "learning_rate": 1.3042096902303417e-05, "loss": 2.9556, "step": 438000 }, { "epoch": 348.0, "eval_loss": 2.807267427444458, "eval_runtime": 67.2363, "eval_samples_per_second": 6388.288, "eval_steps_per_second": 2.082, "step": 438132 }, { "epoch": 348.29, "learning_rate": 1.3034154090548055e-05, "loss": 2.9571, "step": 438500 }, { "epoch": 348.69, "learning_rate": 1.3026211278792693e-05, "loss": 2.9564, "step": 439000 }, { "epoch": 349.0, "eval_loss": 2.8013813495635986, "eval_runtime": 67.4067, "eval_samples_per_second": 6372.145, "eval_steps_per_second": 2.077, "step": 439391 }, { "epoch": 349.09, "learning_rate": 1.3018268467037331e-05, "loss": 2.9537, "step": 439500 }, { "epoch": 349.48, "learning_rate": 1.301032565528197e-05, "loss": 2.9544, "step": 440000 }, { "epoch": 349.88, "learning_rate": 1.3002382843526609e-05, "loss": 2.9543, "step": 440500 }, { "epoch": 350.0, "eval_loss": 2.8012876510620117, "eval_runtime": 65.5329, "eval_samples_per_second": 6554.34, "eval_steps_per_second": 2.136, "step": 440650 }, { "epoch": 350.28, "learning_rate": 1.2994440031771247e-05, "loss": 2.9549, "step": 441000 }, { "epoch": 350.68, "learning_rate": 1.2986497220015885e-05, "loss": 2.9535, "step": 441500 }, { "epoch": 351.0, "eval_loss": 2.8072855472564697, "eval_runtime": 69.4815, "eval_samples_per_second": 6181.863, "eval_steps_per_second": 2.015, "step": 441909 }, { "epoch": 351.07, "learning_rate": 1.2978554408260527e-05, "loss": 2.9534, "step": 442000 }, { "epoch": 351.47, "learning_rate": 1.2970611596505165e-05, "loss": 2.9516, "step": 442500 }, { "epoch": 351.87, "learning_rate": 1.2962668784749803e-05, "loss": 2.9535, "step": 443000 }, { "epoch": 352.0, "eval_loss": 2.804767370223999, "eval_runtime": 67.498, "eval_samples_per_second": 6363.519, "eval_steps_per_second": 2.074, "step": 443168 }, { "epoch": 352.26, "learning_rate": 1.2954725972994443e-05, "loss": 2.9509, "step": 443500 }, { "epoch": 352.66, "learning_rate": 1.294678316123908e-05, "loss": 2.9504, "step": 444000 }, { "epoch": 353.0, "eval_loss": 2.800147294998169, "eval_runtime": 65.5589, "eval_samples_per_second": 6551.742, "eval_steps_per_second": 2.135, "step": 444427 }, { "epoch": 353.06, "learning_rate": 1.2938840349483719e-05, "loss": 2.9515, "step": 444500 }, { "epoch": 353.46, "learning_rate": 1.2930897537728357e-05, "loss": 2.9522, "step": 445000 }, { "epoch": 353.85, "learning_rate": 1.2922954725972997e-05, "loss": 2.9519, "step": 445500 }, { "epoch": 354.0, "eval_loss": 2.804769992828369, "eval_runtime": 66.1586, "eval_samples_per_second": 6492.356, "eval_steps_per_second": 2.116, "step": 445686 }, { "epoch": 354.25, "learning_rate": 1.2915011914217635e-05, "loss": 2.9542, "step": 446000 }, { "epoch": 354.65, "learning_rate": 1.2907069102462273e-05, "loss": 2.9515, "step": 446500 }, { "epoch": 355.0, "eval_loss": 2.796786069869995, "eval_runtime": 66.4318, "eval_samples_per_second": 6465.653, "eval_steps_per_second": 2.107, "step": 446945 }, { "epoch": 355.04, "learning_rate": 1.2899126290706911e-05, "loss": 2.9531, "step": 447000 }, { "epoch": 355.44, "learning_rate": 1.289118347895155e-05, "loss": 2.9526, "step": 447500 }, { "epoch": 355.84, "learning_rate": 1.2883240667196189e-05, "loss": 2.9482, "step": 448000 }, { "epoch": 356.0, "eval_loss": 2.8069205284118652, "eval_runtime": 65.9748, "eval_samples_per_second": 6510.438, "eval_steps_per_second": 2.122, "step": 448204 }, { "epoch": 356.24, "learning_rate": 1.2875297855440827e-05, "loss": 2.9476, "step": 448500 }, { "epoch": 356.63, "learning_rate": 1.2867355043685465e-05, "loss": 2.9504, "step": 449000 }, { "epoch": 357.0, "eval_loss": 2.805333137512207, "eval_runtime": 66.8953, "eval_samples_per_second": 6420.851, "eval_steps_per_second": 2.093, "step": 449463 }, { "epoch": 357.03, "learning_rate": 1.2859412231930103e-05, "loss": 2.9488, "step": 449500 }, { "epoch": 357.43, "learning_rate": 1.2851469420174743e-05, "loss": 2.9501, "step": 450000 }, { "epoch": 357.82, "learning_rate": 1.2843526608419381e-05, "loss": 2.9488, "step": 450500 }, { "epoch": 358.0, "eval_loss": 2.800183057785034, "eval_runtime": 67.2521, "eval_samples_per_second": 6386.785, "eval_steps_per_second": 2.082, "step": 450722 }, { "epoch": 358.22, "learning_rate": 1.283558379666402e-05, "loss": 2.9472, "step": 451000 }, { "epoch": 358.62, "learning_rate": 1.2827640984908657e-05, "loss": 2.9501, "step": 451500 }, { "epoch": 359.0, "eval_loss": 2.7997281551361084, "eval_runtime": 66.7989, "eval_samples_per_second": 6430.118, "eval_steps_per_second": 2.096, "step": 451981 }, { "epoch": 359.02, "learning_rate": 1.2819698173153297e-05, "loss": 2.9495, "step": 452000 }, { "epoch": 359.41, "learning_rate": 1.2811755361397935e-05, "loss": 2.9469, "step": 452500 }, { "epoch": 359.81, "learning_rate": 1.2803812549642573e-05, "loss": 2.9472, "step": 453000 }, { "epoch": 360.0, "eval_loss": 2.8012421131134033, "eval_runtime": 67.3789, "eval_samples_per_second": 6374.773, "eval_steps_per_second": 2.078, "step": 453240 }, { "epoch": 360.21, "learning_rate": 1.2795869737887212e-05, "loss": 2.948, "step": 453500 }, { "epoch": 360.6, "learning_rate": 1.2787926926131853e-05, "loss": 2.9452, "step": 454000 }, { "epoch": 361.0, "eval_loss": 2.8016791343688965, "eval_runtime": 67.474, "eval_samples_per_second": 6365.783, "eval_steps_per_second": 2.075, "step": 454499 }, { "epoch": 361.0, "learning_rate": 1.2779984114376491e-05, "loss": 2.9458, "step": 454500 }, { "epoch": 361.4, "learning_rate": 1.277204130262113e-05, "loss": 2.9481, "step": 455000 }, { "epoch": 361.8, "learning_rate": 1.2764098490865769e-05, "loss": 2.9463, "step": 455500 }, { "epoch": 362.0, "eval_loss": 2.8014163970947266, "eval_runtime": 66.1269, "eval_samples_per_second": 6495.467, "eval_steps_per_second": 2.117, "step": 455758 }, { "epoch": 362.19, "learning_rate": 1.2756155679110407e-05, "loss": 2.946, "step": 456000 }, { "epoch": 362.59, "learning_rate": 1.2748212867355045e-05, "loss": 2.9448, "step": 456500 }, { "epoch": 362.99, "learning_rate": 1.2740270055599683e-05, "loss": 2.9502, "step": 457000 }, { "epoch": 363.0, "eval_loss": 2.7978971004486084, "eval_runtime": 65.8625, "eval_samples_per_second": 6521.538, "eval_steps_per_second": 2.126, "step": 457017 }, { "epoch": 363.38, "learning_rate": 1.2732327243844323e-05, "loss": 2.944, "step": 457500 }, { "epoch": 363.78, "learning_rate": 1.2724384432088961e-05, "loss": 2.9434, "step": 458000 }, { "epoch": 364.0, "eval_loss": 2.79315185546875, "eval_runtime": 66.3366, "eval_samples_per_second": 6474.936, "eval_steps_per_second": 2.11, "step": 458276 }, { "epoch": 364.18, "learning_rate": 1.27164416203336e-05, "loss": 2.9431, "step": 458500 }, { "epoch": 364.58, "learning_rate": 1.2708498808578237e-05, "loss": 2.9448, "step": 459000 }, { "epoch": 364.97, "learning_rate": 1.2700555996822876e-05, "loss": 2.9444, "step": 459500 }, { "epoch": 365.0, "eval_loss": 2.793963670730591, "eval_runtime": 65.6461, "eval_samples_per_second": 6543.036, "eval_steps_per_second": 2.133, "step": 459535 }, { "epoch": 365.37, "learning_rate": 1.2692613185067515e-05, "loss": 2.944, "step": 460000 }, { "epoch": 365.77, "learning_rate": 1.2684670373312154e-05, "loss": 2.9404, "step": 460500 }, { "epoch": 366.0, "eval_loss": 2.793867826461792, "eval_runtime": 66.6727, "eval_samples_per_second": 6442.294, "eval_steps_per_second": 2.1, "step": 460794 }, { "epoch": 366.16, "learning_rate": 1.2676727561556792e-05, "loss": 2.9416, "step": 461000 }, { "epoch": 366.56, "learning_rate": 1.266878474980143e-05, "loss": 2.9435, "step": 461500 }, { "epoch": 366.96, "learning_rate": 1.266084193804607e-05, "loss": 2.9404, "step": 462000 }, { "epoch": 367.0, "eval_loss": 2.790978193283081, "eval_runtime": 67.5851, "eval_samples_per_second": 6355.323, "eval_steps_per_second": 2.071, "step": 462053 }, { "epoch": 367.36, "learning_rate": 1.2652899126290708e-05, "loss": 2.9423, "step": 462500 }, { "epoch": 367.75, "learning_rate": 1.2644956314535346e-05, "loss": 2.9425, "step": 463000 }, { "epoch": 368.0, "eval_loss": 2.7897701263427734, "eval_runtime": 66.5006, "eval_samples_per_second": 6458.969, "eval_steps_per_second": 2.105, "step": 463312 }, { "epoch": 368.15, "learning_rate": 1.2637013502779984e-05, "loss": 2.9428, "step": 463500 }, { "epoch": 368.55, "learning_rate": 1.2629070691024624e-05, "loss": 2.943, "step": 464000 }, { "epoch": 368.94, "learning_rate": 1.2621127879269262e-05, "loss": 2.9424, "step": 464500 }, { "epoch": 369.0, "eval_loss": 2.7928731441497803, "eval_runtime": 66.8055, "eval_samples_per_second": 6429.482, "eval_steps_per_second": 2.096, "step": 464571 }, { "epoch": 369.34, "learning_rate": 1.26131850675139e-05, "loss": 2.9403, "step": 465000 }, { "epoch": 369.74, "learning_rate": 1.2605242255758538e-05, "loss": 2.9418, "step": 465500 }, { "epoch": 370.0, "eval_loss": 2.789602518081665, "eval_runtime": 66.6626, "eval_samples_per_second": 6443.272, "eval_steps_per_second": 2.1, "step": 465830 }, { "epoch": 370.14, "learning_rate": 1.2597299444003178e-05, "loss": 2.9427, "step": 466000 }, { "epoch": 370.53, "learning_rate": 1.2589356632247818e-05, "loss": 2.9428, "step": 466500 }, { "epoch": 370.93, "learning_rate": 1.2581413820492456e-05, "loss": 2.9392, "step": 467000 }, { "epoch": 371.0, "eval_loss": 2.7863101959228516, "eval_runtime": 65.8609, "eval_samples_per_second": 6521.704, "eval_steps_per_second": 2.126, "step": 467089 }, { "epoch": 371.33, "learning_rate": 1.2573471008737096e-05, "loss": 2.9405, "step": 467500 }, { "epoch": 371.72, "learning_rate": 1.2565528196981734e-05, "loss": 2.9382, "step": 468000 }, { "epoch": 372.0, "eval_loss": 2.790802001953125, "eval_runtime": 66.3739, "eval_samples_per_second": 6471.289, "eval_steps_per_second": 2.109, "step": 468348 }, { "epoch": 372.12, "learning_rate": 1.2557585385226372e-05, "loss": 2.9396, "step": 468500 }, { "epoch": 372.52, "learning_rate": 1.254964257347101e-05, "loss": 2.9371, "step": 469000 }, { "epoch": 372.92, "learning_rate": 1.254169976171565e-05, "loss": 2.9368, "step": 469500 }, { "epoch": 373.0, "eval_loss": 2.792484760284424, "eval_runtime": 69.3639, "eval_samples_per_second": 6192.339, "eval_steps_per_second": 2.018, "step": 469607 }, { "epoch": 373.31, "learning_rate": 1.2533756949960288e-05, "loss": 2.9369, "step": 470000 }, { "epoch": 373.71, "learning_rate": 1.2525814138204926e-05, "loss": 2.9364, "step": 470500 }, { "epoch": 374.0, "eval_loss": 2.7865025997161865, "eval_runtime": 66.0757, "eval_samples_per_second": 6500.497, "eval_steps_per_second": 2.119, "step": 470866 }, { "epoch": 374.11, "learning_rate": 1.2517871326449564e-05, "loss": 2.9362, "step": 471000 }, { "epoch": 374.5, "learning_rate": 1.2509928514694202e-05, "loss": 2.9365, "step": 471500 }, { "epoch": 374.9, "learning_rate": 1.2501985702938842e-05, "loss": 2.9401, "step": 472000 }, { "epoch": 375.0, "eval_loss": 2.7915077209472656, "eval_runtime": 67.2409, "eval_samples_per_second": 6387.85, "eval_steps_per_second": 2.082, "step": 472125 }, { "epoch": 375.3, "learning_rate": 1.249404289118348e-05, "loss": 2.9371, "step": 472500 }, { "epoch": 375.69, "learning_rate": 1.2486100079428118e-05, "loss": 2.9386, "step": 473000 }, { "epoch": 376.0, "eval_loss": 2.78747296333313, "eval_runtime": 66.0494, "eval_samples_per_second": 6503.092, "eval_steps_per_second": 2.12, "step": 473384 }, { "epoch": 376.09, "learning_rate": 1.2478157267672756e-05, "loss": 2.9365, "step": 473500 }, { "epoch": 376.49, "learning_rate": 1.2470214455917396e-05, "loss": 2.936, "step": 474000 }, { "epoch": 376.89, "learning_rate": 1.2462271644162034e-05, "loss": 2.9353, "step": 474500 }, { "epoch": 377.0, "eval_loss": 2.787083387374878, "eval_runtime": 65.1819, "eval_samples_per_second": 6589.637, "eval_steps_per_second": 2.148, "step": 474643 }, { "epoch": 377.28, "learning_rate": 1.2454328832406672e-05, "loss": 2.9346, "step": 475000 }, { "epoch": 377.68, "learning_rate": 1.244638602065131e-05, "loss": 2.935, "step": 475500 }, { "epoch": 378.0, "eval_loss": 2.7840054035186768, "eval_runtime": 65.6272, "eval_samples_per_second": 6544.922, "eval_steps_per_second": 2.133, "step": 475902 }, { "epoch": 378.08, "learning_rate": 1.243844320889595e-05, "loss": 2.9349, "step": 476000 }, { "epoch": 378.47, "learning_rate": 1.2430500397140588e-05, "loss": 2.9326, "step": 476500 }, { "epoch": 378.87, "learning_rate": 1.2422557585385226e-05, "loss": 2.9359, "step": 477000 }, { "epoch": 379.0, "eval_loss": 2.786746025085449, "eval_runtime": 65.2189, "eval_samples_per_second": 6585.899, "eval_steps_per_second": 2.147, "step": 477161 }, { "epoch": 379.27, "learning_rate": 1.2414614773629865e-05, "loss": 2.9329, "step": 477500 }, { "epoch": 379.67, "learning_rate": 1.2406671961874504e-05, "loss": 2.9351, "step": 478000 }, { "epoch": 380.0, "eval_loss": 2.7838714122772217, "eval_runtime": 64.82, "eval_samples_per_second": 6626.432, "eval_steps_per_second": 2.16, "step": 478420 }, { "epoch": 380.06, "learning_rate": 1.2398729150119144e-05, "loss": 2.9343, "step": 478500 }, { "epoch": 380.46, "learning_rate": 1.2390786338363782e-05, "loss": 2.9315, "step": 479000 }, { "epoch": 380.86, "learning_rate": 1.2382843526608422e-05, "loss": 2.9311, "step": 479500 }, { "epoch": 381.0, "eval_loss": 2.7795498371124268, "eval_runtime": 65.9238, "eval_samples_per_second": 6515.476, "eval_steps_per_second": 2.124, "step": 479679 }, { "epoch": 381.25, "learning_rate": 1.237490071485306e-05, "loss": 2.9333, "step": 480000 }, { "epoch": 381.65, "learning_rate": 1.2366957903097698e-05, "loss": 2.9309, "step": 480500 }, { "epoch": 382.0, "eval_loss": 2.7884206771850586, "eval_runtime": 66.7269, "eval_samples_per_second": 6437.062, "eval_steps_per_second": 2.098, "step": 480938 }, { "epoch": 382.05, "learning_rate": 1.2359015091342336e-05, "loss": 2.9318, "step": 481000 }, { "epoch": 382.45, "learning_rate": 1.2351072279586974e-05, "loss": 2.9315, "step": 481500 }, { "epoch": 382.84, "learning_rate": 1.2343129467831614e-05, "loss": 2.931, "step": 482000 }, { "epoch": 383.0, "eval_loss": 2.7798800468444824, "eval_runtime": 65.4132, "eval_samples_per_second": 6566.33, "eval_steps_per_second": 2.14, "step": 482197 }, { "epoch": 383.24, "learning_rate": 1.2335186656076252e-05, "loss": 2.9309, "step": 482500 }, { "epoch": 383.64, "learning_rate": 1.232724384432089e-05, "loss": 2.9342, "step": 483000 }, { "epoch": 384.0, "eval_loss": 2.7786638736724854, "eval_runtime": 66.3534, "eval_samples_per_second": 6473.294, "eval_steps_per_second": 2.11, "step": 483456 }, { "epoch": 384.03, "learning_rate": 1.2319301032565529e-05, "loss": 2.9304, "step": 483500 }, { "epoch": 384.43, "learning_rate": 1.2311358220810168e-05, "loss": 2.9314, "step": 484000 }, { "epoch": 384.83, "learning_rate": 1.2303415409054807e-05, "loss": 2.9309, "step": 484500 }, { "epoch": 385.0, "eval_loss": 2.7783045768737793, "eval_runtime": 65.3253, "eval_samples_per_second": 6575.173, "eval_steps_per_second": 2.143, "step": 484715 }, { "epoch": 385.23, "learning_rate": 1.2295472597299445e-05, "loss": 2.931, "step": 485000 }, { "epoch": 385.62, "learning_rate": 1.2287529785544083e-05, "loss": 2.9301, "step": 485500 }, { "epoch": 386.0, "eval_loss": 2.7856228351593018, "eval_runtime": 67.7422, "eval_samples_per_second": 6340.581, "eval_steps_per_second": 2.067, "step": 485974 }, { "epoch": 386.02, "learning_rate": 1.2279586973788723e-05, "loss": 2.9298, "step": 486000 }, { "epoch": 386.42, "learning_rate": 1.227164416203336e-05, "loss": 2.9289, "step": 486500 }, { "epoch": 386.81, "learning_rate": 1.2263701350277999e-05, "loss": 2.9313, "step": 487000 }, { "epoch": 387.0, "eval_loss": 2.777822256088257, "eval_runtime": 65.9196, "eval_samples_per_second": 6515.896, "eval_steps_per_second": 2.124, "step": 487233 }, { "epoch": 387.21, "learning_rate": 1.2255758538522637e-05, "loss": 2.9278, "step": 487500 }, { "epoch": 387.61, "learning_rate": 1.2247815726767277e-05, "loss": 2.9272, "step": 488000 }, { "epoch": 388.0, "eval_loss": 2.7712631225585938, "eval_runtime": 65.7289, "eval_samples_per_second": 6534.794, "eval_steps_per_second": 2.13, "step": 488492 }, { "epoch": 388.01, "learning_rate": 1.2239872915011915e-05, "loss": 2.9284, "step": 488500 }, { "epoch": 388.4, "learning_rate": 1.2231930103256553e-05, "loss": 2.9268, "step": 489000 }, { "epoch": 388.8, "learning_rate": 1.2223987291501191e-05, "loss": 2.9256, "step": 489500 }, { "epoch": 389.0, "eval_loss": 2.7801990509033203, "eval_runtime": 66.8575, "eval_samples_per_second": 6424.487, "eval_steps_per_second": 2.094, "step": 489751 }, { "epoch": 389.2, "learning_rate": 1.2216044479745829e-05, "loss": 2.9241, "step": 490000 }, { "epoch": 389.59, "learning_rate": 1.220810166799047e-05, "loss": 2.9298, "step": 490500 }, { "epoch": 389.99, "learning_rate": 1.2200158856235109e-05, "loss": 2.9273, "step": 491000 }, { "epoch": 390.0, "eval_loss": 2.777092933654785, "eval_runtime": 65.5776, "eval_samples_per_second": 6549.871, "eval_steps_per_second": 2.135, "step": 491010 }, { "epoch": 390.39, "learning_rate": 1.2192216044479749e-05, "loss": 2.9264, "step": 491500 }, { "epoch": 390.79, "learning_rate": 1.2184273232724387e-05, "loss": 2.9249, "step": 492000 }, { "epoch": 391.0, "eval_loss": 2.777336835861206, "eval_runtime": 67.2285, "eval_samples_per_second": 6389.03, "eval_steps_per_second": 2.082, "step": 492269 }, { "epoch": 391.18, "learning_rate": 1.2176330420969025e-05, "loss": 2.9273, "step": 492500 }, { "epoch": 391.58, "learning_rate": 1.2168387609213663e-05, "loss": 2.9263, "step": 493000 }, { "epoch": 391.98, "learning_rate": 1.2160444797458301e-05, "loss": 2.9256, "step": 493500 }, { "epoch": 392.0, "eval_loss": 2.7697529792785645, "eval_runtime": 65.4325, "eval_samples_per_second": 6564.394, "eval_steps_per_second": 2.14, "step": 493528 }, { "epoch": 392.37, "learning_rate": 1.215250198570294e-05, "loss": 2.9278, "step": 494000 }, { "epoch": 392.77, "learning_rate": 1.2144559173947579e-05, "loss": 2.926, "step": 494500 }, { "epoch": 393.0, "eval_loss": 2.780413866043091, "eval_runtime": 66.1003, "eval_samples_per_second": 6498.082, "eval_steps_per_second": 2.118, "step": 494787 }, { "epoch": 393.17, "learning_rate": 1.2136616362192217e-05, "loss": 2.9245, "step": 495000 }, { "epoch": 393.57, "learning_rate": 1.2128673550436855e-05, "loss": 2.9216, "step": 495500 }, { "epoch": 393.96, "learning_rate": 1.2120730738681495e-05, "loss": 2.9284, "step": 496000 }, { "epoch": 394.0, "eval_loss": 2.774165630340576, "eval_runtime": 65.4839, "eval_samples_per_second": 6559.243, "eval_steps_per_second": 2.138, "step": 496046 }, { "epoch": 394.36, "learning_rate": 1.2112787926926133e-05, "loss": 2.9233, "step": 496500 }, { "epoch": 394.76, "learning_rate": 1.2104845115170771e-05, "loss": 2.9218, "step": 497000 }, { "epoch": 395.0, "eval_loss": 2.771805763244629, "eval_runtime": 65.6215, "eval_samples_per_second": 6545.49, "eval_steps_per_second": 2.133, "step": 497305 }, { "epoch": 395.15, "learning_rate": 1.209690230341541e-05, "loss": 2.9215, "step": 497500 }, { "epoch": 395.55, "learning_rate": 1.2088959491660049e-05, "loss": 2.9254, "step": 498000 }, { "epoch": 395.95, "learning_rate": 1.2081016679904687e-05, "loss": 2.9229, "step": 498500 }, { "epoch": 396.0, "eval_loss": 2.777796983718872, "eval_runtime": 65.7435, "eval_samples_per_second": 6533.344, "eval_steps_per_second": 2.129, "step": 498564 }, { "epoch": 396.35, "learning_rate": 1.2073073868149325e-05, "loss": 2.9237, "step": 499000 }, { "epoch": 396.74, "learning_rate": 1.2065131056393963e-05, "loss": 2.9232, "step": 499500 }, { "epoch": 397.0, "eval_loss": 2.774951457977295, "eval_runtime": 69.933, "eval_samples_per_second": 6141.947, "eval_steps_per_second": 2.002, "step": 499823 }, { "epoch": 397.14, "learning_rate": 1.2057188244638603e-05, "loss": 2.9244, "step": 500000 }, { "epoch": 397.54, "learning_rate": 1.2049245432883241e-05, "loss": 2.9236, "step": 500500 }, { "epoch": 397.93, "learning_rate": 1.204130262112788e-05, "loss": 2.9216, "step": 501000 }, { "epoch": 398.0, "eval_loss": 2.769907236099243, "eval_runtime": 65.2798, "eval_samples_per_second": 6579.75, "eval_steps_per_second": 2.145, "step": 501082 }, { "epoch": 398.33, "learning_rate": 1.2033359809372518e-05, "loss": 2.9218, "step": 501500 }, { "epoch": 398.73, "learning_rate": 1.2025416997617156e-05, "loss": 2.9244, "step": 502000 }, { "epoch": 399.0, "eval_loss": 2.7754454612731934, "eval_runtime": 67.1212, "eval_samples_per_second": 6399.245, "eval_steps_per_second": 2.086, "step": 502341 }, { "epoch": 399.13, "learning_rate": 1.2017474185861797e-05, "loss": 2.9214, "step": 502500 }, { "epoch": 399.52, "learning_rate": 1.2009531374106435e-05, "loss": 2.9212, "step": 503000 }, { "epoch": 399.92, "learning_rate": 1.2001588562351073e-05, "loss": 2.9224, "step": 503500 }, { "epoch": 400.0, "eval_loss": 2.773928165435791, "eval_runtime": 68.0882, "eval_samples_per_second": 6308.362, "eval_steps_per_second": 2.056, "step": 503600 }, { "epoch": 400.32, "learning_rate": 1.1993645750595713e-05, "loss": 2.9224, "step": 504000 }, { "epoch": 400.71, "learning_rate": 1.1985702938840351e-05, "loss": 2.9187, "step": 504500 }, { "epoch": 401.0, "eval_loss": 2.770015239715576, "eval_runtime": 66.4695, "eval_samples_per_second": 6461.983, "eval_steps_per_second": 2.106, "step": 504859 }, { "epoch": 401.11, "learning_rate": 1.197776012708499e-05, "loss": 2.9195, "step": 505000 }, { "epoch": 401.51, "learning_rate": 1.1969817315329627e-05, "loss": 2.9208, "step": 505500 }, { "epoch": 401.91, "learning_rate": 1.1961874503574267e-05, "loss": 2.9185, "step": 506000 }, { "epoch": 402.0, "eval_loss": 2.7690629959106445, "eval_runtime": 65.2987, "eval_samples_per_second": 6577.849, "eval_steps_per_second": 2.144, "step": 506118 }, { "epoch": 402.3, "learning_rate": 1.1953931691818905e-05, "loss": 2.9182, "step": 506500 }, { "epoch": 402.7, "learning_rate": 1.1945988880063544e-05, "loss": 2.9187, "step": 507000 }, { "epoch": 403.0, "eval_loss": 2.771662473678589, "eval_runtime": 66.4081, "eval_samples_per_second": 6467.956, "eval_steps_per_second": 2.108, "step": 507377 }, { "epoch": 403.1, "learning_rate": 1.1938046068308182e-05, "loss": 2.919, "step": 507500 }, { "epoch": 403.49, "learning_rate": 1.1930103256552821e-05, "loss": 2.9203, "step": 508000 }, { "epoch": 403.89, "learning_rate": 1.192216044479746e-05, "loss": 2.9183, "step": 508500 }, { "epoch": 404.0, "eval_loss": 2.7712953090667725, "eval_runtime": 65.2198, "eval_samples_per_second": 6585.811, "eval_steps_per_second": 2.147, "step": 508636 }, { "epoch": 404.29, "learning_rate": 1.1914217633042098e-05, "loss": 2.9191, "step": 509000 }, { "epoch": 404.69, "learning_rate": 1.1906274821286736e-05, "loss": 2.9164, "step": 509500 }, { "epoch": 405.0, "eval_loss": 2.769975185394287, "eval_runtime": 64.8779, "eval_samples_per_second": 6620.51, "eval_steps_per_second": 2.158, "step": 509895 }, { "epoch": 405.08, "learning_rate": 1.1898332009531376e-05, "loss": 2.9174, "step": 510000 }, { "epoch": 405.48, "learning_rate": 1.1890389197776014e-05, "loss": 2.9177, "step": 510500 }, { "epoch": 405.88, "learning_rate": 1.1882446386020652e-05, "loss": 2.9177, "step": 511000 }, { "epoch": 406.0, "eval_loss": 2.7673566341400146, "eval_runtime": 66.89, "eval_samples_per_second": 6421.362, "eval_steps_per_second": 2.093, "step": 511154 }, { "epoch": 406.27, "learning_rate": 1.187450357426529e-05, "loss": 2.9179, "step": 511500 }, { "epoch": 406.67, "learning_rate": 1.1866560762509928e-05, "loss": 2.9172, "step": 512000 }, { "epoch": 407.0, "eval_loss": 2.768249273300171, "eval_runtime": 65.2421, "eval_samples_per_second": 6583.558, "eval_steps_per_second": 2.146, "step": 512413 }, { "epoch": 407.07, "learning_rate": 1.1858617950754568e-05, "loss": 2.9173, "step": 512500 }, { "epoch": 407.47, "learning_rate": 1.1850675138999206e-05, "loss": 2.9177, "step": 513000 }, { "epoch": 407.86, "learning_rate": 1.1842732327243844e-05, "loss": 2.9141, "step": 513500 }, { "epoch": 408.0, "eval_loss": 2.7689473628997803, "eval_runtime": 65.9507, "eval_samples_per_second": 6512.823, "eval_steps_per_second": 2.123, "step": 513672 }, { "epoch": 408.26, "learning_rate": 1.1834789515488482e-05, "loss": 2.9157, "step": 514000 }, { "epoch": 408.66, "learning_rate": 1.1826846703733124e-05, "loss": 2.9142, "step": 514500 }, { "epoch": 409.0, "eval_loss": 2.767486572265625, "eval_runtime": 65.8424, "eval_samples_per_second": 6523.532, "eval_steps_per_second": 2.126, "step": 514931 }, { "epoch": 409.05, "learning_rate": 1.1818903891977762e-05, "loss": 2.9131, "step": 515000 }, { "epoch": 409.45, "learning_rate": 1.18109610802224e-05, "loss": 2.9141, "step": 515500 }, { "epoch": 409.85, "learning_rate": 1.180301826846704e-05, "loss": 2.9167, "step": 516000 }, { "epoch": 410.0, "eval_loss": 2.7696990966796875, "eval_runtime": 69.5031, "eval_samples_per_second": 6179.941, "eval_steps_per_second": 2.014, "step": 516190 }, { "epoch": 410.25, "learning_rate": 1.1795075456711678e-05, "loss": 2.9154, "step": 516500 }, { "epoch": 410.64, "learning_rate": 1.1787132644956316e-05, "loss": 2.9148, "step": 517000 }, { "epoch": 411.0, "eval_loss": 2.7675933837890625, "eval_runtime": 65.4507, "eval_samples_per_second": 6562.57, "eval_steps_per_second": 2.139, "step": 517449 }, { "epoch": 411.04, "learning_rate": 1.1779189833200954e-05, "loss": 2.9145, "step": 517500 }, { "epoch": 411.44, "learning_rate": 1.1771247021445594e-05, "loss": 2.9125, "step": 518000 }, { "epoch": 411.83, "learning_rate": 1.1763304209690232e-05, "loss": 2.9137, "step": 518500 }, { "epoch": 412.0, "eval_loss": 2.764453172683716, "eval_runtime": 64.8723, "eval_samples_per_second": 6621.083, "eval_steps_per_second": 2.158, "step": 518708 }, { "epoch": 412.23, "learning_rate": 1.175536139793487e-05, "loss": 2.913, "step": 519000 }, { "epoch": 412.63, "learning_rate": 1.1747418586179508e-05, "loss": 2.9115, "step": 519500 }, { "epoch": 413.0, "eval_loss": 2.766505241394043, "eval_runtime": 67.0343, "eval_samples_per_second": 6407.54, "eval_steps_per_second": 2.088, "step": 519967 }, { "epoch": 413.03, "learning_rate": 1.1739475774424148e-05, "loss": 2.9116, "step": 520000 }, { "epoch": 413.42, "learning_rate": 1.1731532962668786e-05, "loss": 2.9115, "step": 520500 }, { "epoch": 413.82, "learning_rate": 1.1723590150913424e-05, "loss": 2.9137, "step": 521000 }, { "epoch": 414.0, "eval_loss": 2.764331817626953, "eval_runtime": 65.3538, "eval_samples_per_second": 6572.3, "eval_steps_per_second": 2.142, "step": 521226 }, { "epoch": 414.22, "learning_rate": 1.1715647339158062e-05, "loss": 2.9139, "step": 521500 }, { "epoch": 414.61, "learning_rate": 1.1707704527402702e-05, "loss": 2.9157, "step": 522000 }, { "epoch": 415.0, "eval_loss": 2.7618625164031982, "eval_runtime": 65.4175, "eval_samples_per_second": 6565.903, "eval_steps_per_second": 2.14, "step": 522485 }, { "epoch": 415.01, "learning_rate": 1.169976171564734e-05, "loss": 2.9131, "step": 522500 }, { "epoch": 415.41, "learning_rate": 1.1691818903891978e-05, "loss": 2.9109, "step": 523000 }, { "epoch": 415.81, "learning_rate": 1.1683876092136616e-05, "loss": 2.9144, "step": 523500 }, { "epoch": 416.0, "eval_loss": 2.762502670288086, "eval_runtime": 66.4079, "eval_samples_per_second": 6467.976, "eval_steps_per_second": 2.108, "step": 523744 }, { "epoch": 416.2, "learning_rate": 1.1675933280381255e-05, "loss": 2.9108, "step": 524000 }, { "epoch": 416.6, "learning_rate": 1.1667990468625894e-05, "loss": 2.9102, "step": 524500 }, { "epoch": 417.0, "learning_rate": 1.1660047656870532e-05, "loss": 2.9116, "step": 525000 }, { "epoch": 417.0, "eval_loss": 2.759247064590454, "eval_runtime": 66.4232, "eval_samples_per_second": 6466.487, "eval_steps_per_second": 2.108, "step": 525003 }, { "epoch": 417.39, "learning_rate": 1.165210484511517e-05, "loss": 2.9095, "step": 525500 }, { "epoch": 417.79, "learning_rate": 1.1644162033359809e-05, "loss": 2.9099, "step": 526000 }, { "epoch": 418.0, "eval_loss": 2.762789011001587, "eval_runtime": 66.9865, "eval_samples_per_second": 6412.111, "eval_steps_per_second": 2.09, "step": 526262 }, { "epoch": 418.19, "learning_rate": 1.1636219221604448e-05, "loss": 2.9104, "step": 526500 }, { "epoch": 418.59, "learning_rate": 1.1628276409849088e-05, "loss": 2.9101, "step": 527000 }, { "epoch": 418.98, "learning_rate": 1.1620333598093726e-05, "loss": 2.9091, "step": 527500 }, { "epoch": 419.0, "eval_loss": 2.761685609817505, "eval_runtime": 66.6974, "eval_samples_per_second": 6439.907, "eval_steps_per_second": 2.099, "step": 527521 }, { "epoch": 419.38, "learning_rate": 1.1612390786338366e-05, "loss": 2.9131, "step": 528000 }, { "epoch": 419.78, "learning_rate": 1.1604447974583004e-05, "loss": 2.9097, "step": 528500 }, { "epoch": 420.0, "eval_loss": 2.766441822052002, "eval_runtime": 66.54, "eval_samples_per_second": 6455.138, "eval_steps_per_second": 2.104, "step": 528780 }, { "epoch": 420.17, "learning_rate": 1.1596505162827642e-05, "loss": 2.9086, "step": 529000 }, { "epoch": 420.57, "learning_rate": 1.158856235107228e-05, "loss": 2.9063, "step": 529500 }, { "epoch": 420.97, "learning_rate": 1.158061953931692e-05, "loss": 2.9087, "step": 530000 }, { "epoch": 421.0, "eval_loss": 2.767285108566284, "eval_runtime": 69.79, "eval_samples_per_second": 6154.537, "eval_steps_per_second": 2.006, "step": 530039 }, { "epoch": 421.37, "learning_rate": 1.1572676727561558e-05, "loss": 2.9074, "step": 530500 }, { "epoch": 421.76, "learning_rate": 1.1564733915806197e-05, "loss": 2.9093, "step": 531000 }, { "epoch": 422.0, "eval_loss": 2.7607202529907227, "eval_runtime": 65.9215, "eval_samples_per_second": 6515.709, "eval_steps_per_second": 2.124, "step": 531298 }, { "epoch": 422.16, "learning_rate": 1.1556791104050835e-05, "loss": 2.9072, "step": 531500 }, { "epoch": 422.56, "learning_rate": 1.1548848292295474e-05, "loss": 2.9087, "step": 532000 }, { "epoch": 422.95, "learning_rate": 1.1540905480540113e-05, "loss": 2.9075, "step": 532500 }, { "epoch": 423.0, "eval_loss": 2.7596447467803955, "eval_runtime": 67.3509, "eval_samples_per_second": 6377.423, "eval_steps_per_second": 2.079, "step": 532557 }, { "epoch": 423.35, "learning_rate": 1.153296266878475e-05, "loss": 2.9045, "step": 533000 }, { "epoch": 423.75, "learning_rate": 1.1525019857029389e-05, "loss": 2.9065, "step": 533500 }, { "epoch": 424.0, "eval_loss": 2.760394811630249, "eval_runtime": 65.4438, "eval_samples_per_second": 6563.265, "eval_steps_per_second": 2.139, "step": 533816 }, { "epoch": 424.15, "learning_rate": 1.1517077045274027e-05, "loss": 2.9087, "step": 534000 }, { "epoch": 424.54, "learning_rate": 1.1509134233518667e-05, "loss": 2.9061, "step": 534500 }, { "epoch": 424.94, "learning_rate": 1.1501191421763305e-05, "loss": 2.9063, "step": 535000 }, { "epoch": 425.0, "eval_loss": 2.7589619159698486, "eval_runtime": 65.3924, "eval_samples_per_second": 6568.426, "eval_steps_per_second": 2.141, "step": 535075 }, { "epoch": 425.34, "learning_rate": 1.1493248610007943e-05, "loss": 2.9039, "step": 535500 }, { "epoch": 425.73, "learning_rate": 1.1485305798252581e-05, "loss": 2.9016, "step": 536000 }, { "epoch": 426.0, "eval_loss": 2.7568178176879883, "eval_runtime": 67.492, "eval_samples_per_second": 6364.087, "eval_steps_per_second": 2.074, "step": 536334 }, { "epoch": 426.13, "learning_rate": 1.147736298649722e-05, "loss": 2.9032, "step": 536500 }, { "epoch": 426.53, "learning_rate": 1.1469420174741859e-05, "loss": 2.904, "step": 537000 }, { "epoch": 426.93, "learning_rate": 1.1461477362986497e-05, "loss": 2.9072, "step": 537500 }, { "epoch": 427.0, "eval_loss": 2.7628705501556396, "eval_runtime": 65.655, "eval_samples_per_second": 6542.156, "eval_steps_per_second": 2.132, "step": 537593 }, { "epoch": 427.32, "learning_rate": 1.1453534551231135e-05, "loss": 2.9059, "step": 538000 }, { "epoch": 427.72, "learning_rate": 1.1445591739475775e-05, "loss": 2.9052, "step": 538500 }, { "epoch": 428.0, "eval_loss": 2.7568562030792236, "eval_runtime": 67.6325, "eval_samples_per_second": 6350.867, "eval_steps_per_second": 2.07, "step": 538852 }, { "epoch": 428.12, "learning_rate": 1.1437648927720415e-05, "loss": 2.9052, "step": 539000 }, { "epoch": 428.51, "learning_rate": 1.1429706115965053e-05, "loss": 2.9033, "step": 539500 }, { "epoch": 428.91, "learning_rate": 1.1421763304209693e-05, "loss": 2.9061, "step": 540000 }, { "epoch": 429.0, "eval_loss": 2.758477210998535, "eval_runtime": 66.4539, "eval_samples_per_second": 6463.506, "eval_steps_per_second": 2.107, "step": 540111 }, { "epoch": 429.31, "learning_rate": 1.141382049245433e-05, "loss": 2.9039, "step": 540500 }, { "epoch": 429.71, "learning_rate": 1.1405877680698969e-05, "loss": 2.9028, "step": 541000 }, { "epoch": 430.0, "eval_loss": 2.7521698474884033, "eval_runtime": 65.9173, "eval_samples_per_second": 6516.119, "eval_steps_per_second": 2.124, "step": 541370 }, { "epoch": 430.1, "learning_rate": 1.1397934868943607e-05, "loss": 2.903, "step": 541500 }, { "epoch": 430.5, "learning_rate": 1.1389992057188247e-05, "loss": 2.9023, "step": 542000 }, { "epoch": 430.9, "learning_rate": 1.1382049245432885e-05, "loss": 2.9039, "step": 542500 }, { "epoch": 431.0, "eval_loss": 2.7577943801879883, "eval_runtime": 66.8314, "eval_samples_per_second": 6426.998, "eval_steps_per_second": 2.095, "step": 542629 }, { "epoch": 431.29, "learning_rate": 1.1374106433677523e-05, "loss": 2.9017, "step": 543000 }, { "epoch": 431.69, "learning_rate": 1.1366163621922161e-05, "loss": 2.9062, "step": 543500 }, { "epoch": 432.0, "eval_loss": 2.7584099769592285, "eval_runtime": 67.9866, "eval_samples_per_second": 6317.787, "eval_steps_per_second": 2.059, "step": 543888 }, { "epoch": 432.09, "learning_rate": 1.1358220810166801e-05, "loss": 2.9006, "step": 544000 }, { "epoch": 432.49, "learning_rate": 1.1350277998411439e-05, "loss": 2.9029, "step": 544500 }, { "epoch": 432.88, "learning_rate": 1.1342335186656077e-05, "loss": 2.9036, "step": 545000 }, { "epoch": 433.0, "eval_loss": 2.7526750564575195, "eval_runtime": 69.5258, "eval_samples_per_second": 6177.921, "eval_steps_per_second": 2.014, "step": 545147 }, { "epoch": 433.28, "learning_rate": 1.1334392374900715e-05, "loss": 2.906, "step": 545500 }, { "epoch": 433.68, "learning_rate": 1.1326449563145353e-05, "loss": 2.9036, "step": 546000 }, { "epoch": 434.0, "eval_loss": 2.7522451877593994, "eval_runtime": 67.4345, "eval_samples_per_second": 6369.511, "eval_steps_per_second": 2.076, "step": 546406 }, { "epoch": 434.07, "learning_rate": 1.1318506751389993e-05, "loss": 2.9047, "step": 546500 }, { "epoch": 434.47, "learning_rate": 1.1310563939634631e-05, "loss": 2.9003, "step": 547000 }, { "epoch": 434.87, "learning_rate": 1.130262112787927e-05, "loss": 2.9007, "step": 547500 }, { "epoch": 435.0, "eval_loss": 2.7565929889678955, "eval_runtime": 67.2987, "eval_samples_per_second": 6382.367, "eval_steps_per_second": 2.08, "step": 547665 }, { "epoch": 435.27, "learning_rate": 1.1294678316123908e-05, "loss": 2.8997, "step": 548000 }, { "epoch": 435.66, "learning_rate": 1.1286735504368547e-05, "loss": 2.9013, "step": 548500 }, { "epoch": 436.0, "eval_loss": 2.748744249343872, "eval_runtime": 68.9669, "eval_samples_per_second": 6227.988, "eval_steps_per_second": 2.03, "step": 548924 }, { "epoch": 436.06, "learning_rate": 1.1278792692613185e-05, "loss": 2.9, "step": 549000 }, { "epoch": 436.46, "learning_rate": 1.1270849880857824e-05, "loss": 2.8986, "step": 549500 }, { "epoch": 436.85, "learning_rate": 1.1262907069102462e-05, "loss": 2.9005, "step": 550000 }, { "epoch": 437.0, "eval_loss": 2.7514407634735107, "eval_runtime": 69.6387, "eval_samples_per_second": 6167.906, "eval_steps_per_second": 2.01, "step": 550183 }, { "epoch": 437.25, "learning_rate": 1.1254964257347101e-05, "loss": 2.9004, "step": 550500 }, { "epoch": 437.65, "learning_rate": 1.1247021445591741e-05, "loss": 2.9005, "step": 551000 }, { "epoch": 438.0, "eval_loss": 2.751617431640625, "eval_runtime": 65.6009, "eval_samples_per_second": 6547.545, "eval_steps_per_second": 2.134, "step": 551442 }, { "epoch": 438.05, "learning_rate": 1.123907863383638e-05, "loss": 2.9003, "step": 551500 }, { "epoch": 438.44, "learning_rate": 1.123113582208102e-05, "loss": 2.897, "step": 552000 }, { "epoch": 438.84, "learning_rate": 1.1223193010325657e-05, "loss": 2.9, "step": 552500 }, { "epoch": 439.0, "eval_loss": 2.749370574951172, "eval_runtime": 67.5464, "eval_samples_per_second": 6358.958, "eval_steps_per_second": 2.073, "step": 552701 }, { "epoch": 439.24, "learning_rate": 1.1215250198570295e-05, "loss": 2.8953, "step": 553000 }, { "epoch": 439.63, "learning_rate": 1.1207307386814934e-05, "loss": 2.8997, "step": 553500 }, { "epoch": 440.0, "eval_loss": 2.754873514175415, "eval_runtime": 66.358, "eval_samples_per_second": 6472.841, "eval_steps_per_second": 2.11, "step": 553960 }, { "epoch": 440.03, "learning_rate": 1.1199364575059573e-05, "loss": 2.8991, "step": 554000 }, { "epoch": 440.43, "learning_rate": 1.1191421763304211e-05, "loss": 2.8971, "step": 554500 }, { "epoch": 440.83, "learning_rate": 1.118347895154885e-05, "loss": 2.8974, "step": 555000 }, { "epoch": 441.0, "eval_loss": 2.7547402381896973, "eval_runtime": 69.4529, "eval_samples_per_second": 6184.407, "eval_steps_per_second": 2.016, "step": 555219 }, { "epoch": 441.22, "learning_rate": 1.1175536139793488e-05, "loss": 2.8978, "step": 555500 }, { "epoch": 441.62, "learning_rate": 1.1167593328038126e-05, "loss": 2.8972, "step": 556000 }, { "epoch": 442.0, "eval_loss": 2.751312255859375, "eval_runtime": 66.9326, "eval_samples_per_second": 6417.275, "eval_steps_per_second": 2.092, "step": 556478 }, { "epoch": 442.02, "learning_rate": 1.1159650516282766e-05, "loss": 2.8974, "step": 556500 }, { "epoch": 442.41, "learning_rate": 1.1151707704527404e-05, "loss": 2.8978, "step": 557000 }, { "epoch": 442.81, "learning_rate": 1.1143764892772042e-05, "loss": 2.8961, "step": 557500 }, { "epoch": 443.0, "eval_loss": 2.747730016708374, "eval_runtime": 69.1768, "eval_samples_per_second": 6209.087, "eval_steps_per_second": 2.024, "step": 557737 }, { "epoch": 443.21, "learning_rate": 1.113582208101668e-05, "loss": 2.897, "step": 558000 }, { "epoch": 443.61, "learning_rate": 1.112787926926132e-05, "loss": 2.896, "step": 558500 }, { "epoch": 444.0, "eval_loss": 2.7489864826202393, "eval_runtime": 70.2202, "eval_samples_per_second": 6116.827, "eval_steps_per_second": 1.994, "step": 558996 }, { "epoch": 444.0, "learning_rate": 1.1119936457505958e-05, "loss": 2.8964, "step": 559000 }, { "epoch": 444.4, "learning_rate": 1.1111993645750596e-05, "loss": 2.8979, "step": 559500 }, { "epoch": 444.8, "learning_rate": 1.1104050833995234e-05, "loss": 2.8946, "step": 560000 }, { "epoch": 445.0, "eval_loss": 2.7474751472473145, "eval_runtime": 66.7216, "eval_samples_per_second": 6437.569, "eval_steps_per_second": 2.098, "step": 560255 }, { "epoch": 445.19, "learning_rate": 1.1096108022239874e-05, "loss": 2.8972, "step": 560500 }, { "epoch": 445.59, "learning_rate": 1.1088165210484512e-05, "loss": 2.8944, "step": 561000 }, { "epoch": 445.99, "learning_rate": 1.108022239872915e-05, "loss": 2.8956, "step": 561500 }, { "epoch": 446.0, "eval_loss": 2.750474691390991, "eval_runtime": 65.6647, "eval_samples_per_second": 6541.19, "eval_steps_per_second": 2.132, "step": 561514 }, { "epoch": 446.39, "learning_rate": 1.1072279586973788e-05, "loss": 2.896, "step": 562000 }, { "epoch": 446.78, "learning_rate": 1.1064336775218428e-05, "loss": 2.8939, "step": 562500 }, { "epoch": 447.0, "eval_loss": 2.7446706295013428, "eval_runtime": 69.5444, "eval_samples_per_second": 6176.272, "eval_steps_per_second": 2.013, "step": 562773 }, { "epoch": 447.18, "learning_rate": 1.1056393963463068e-05, "loss": 2.8946, "step": 563000 }, { "epoch": 447.58, "learning_rate": 1.1048451151707706e-05, "loss": 2.8938, "step": 563500 }, { "epoch": 447.97, "learning_rate": 1.1040508339952346e-05, "loss": 2.8946, "step": 564000 }, { "epoch": 448.0, "eval_loss": 2.7440638542175293, "eval_runtime": 66.9207, "eval_samples_per_second": 6418.417, "eval_steps_per_second": 2.092, "step": 564032 }, { "epoch": 448.37, "learning_rate": 1.1032565528196984e-05, "loss": 2.8932, "step": 564500 }, { "epoch": 448.77, "learning_rate": 1.1024622716441622e-05, "loss": 2.8942, "step": 565000 }, { "epoch": 449.0, "eval_loss": 2.7514703273773193, "eval_runtime": 66.2749, "eval_samples_per_second": 6480.962, "eval_steps_per_second": 2.112, "step": 565291 }, { "epoch": 449.17, "learning_rate": 1.101667990468626e-05, "loss": 2.8942, "step": 565500 }, { "epoch": 449.56, "learning_rate": 1.10087370929309e-05, "loss": 2.8932, "step": 566000 }, { "epoch": 449.96, "learning_rate": 1.1000794281175538e-05, "loss": 2.8951, "step": 566500 }, { "epoch": 450.0, "eval_loss": 2.7446210384368896, "eval_runtime": 67.9522, "eval_samples_per_second": 6320.991, "eval_steps_per_second": 2.06, "step": 566550 }, { "epoch": 450.36, "learning_rate": 1.0992851469420176e-05, "loss": 2.8945, "step": 567000 }, { "epoch": 450.75, "learning_rate": 1.0984908657664814e-05, "loss": 2.8919, "step": 567500 }, { "epoch": 451.0, "eval_loss": 2.7470197677612305, "eval_runtime": 67.2468, "eval_samples_per_second": 6387.291, "eval_steps_per_second": 2.082, "step": 567809 }, { "epoch": 451.15, "learning_rate": 1.0976965845909452e-05, "loss": 2.8911, "step": 568000 }, { "epoch": 451.55, "learning_rate": 1.0969023034154092e-05, "loss": 2.8929, "step": 568500 }, { "epoch": 451.95, "learning_rate": 1.096108022239873e-05, "loss": 2.8926, "step": 569000 }, { "epoch": 452.0, "eval_loss": 2.7439239025115967, "eval_runtime": 66.5305, "eval_samples_per_second": 6456.064, "eval_steps_per_second": 2.104, "step": 569068 }, { "epoch": 452.34, "learning_rate": 1.0953137410643368e-05, "loss": 2.893, "step": 569500 }, { "epoch": 452.74, "learning_rate": 1.0945194598888006e-05, "loss": 2.8925, "step": 570000 }, { "epoch": 453.0, "eval_loss": 2.749398708343506, "eval_runtime": 67.0803, "eval_samples_per_second": 6403.142, "eval_steps_per_second": 2.087, "step": 570327 }, { "epoch": 453.14, "learning_rate": 1.0937251787132646e-05, "loss": 2.8911, "step": 570500 }, { "epoch": 453.53, "learning_rate": 1.0929308975377284e-05, "loss": 2.8924, "step": 571000 }, { "epoch": 453.93, "learning_rate": 1.0921366163621922e-05, "loss": 2.892, "step": 571500 }, { "epoch": 454.0, "eval_loss": 2.7378273010253906, "eval_runtime": 67.3646, "eval_samples_per_second": 6376.123, "eval_steps_per_second": 2.078, "step": 571586 }, { "epoch": 454.33, "learning_rate": 1.091342335186656e-05, "loss": 2.8927, "step": 572000 }, { "epoch": 454.73, "learning_rate": 1.09054805401112e-05, "loss": 2.8907, "step": 572500 }, { "epoch": 455.0, "eval_loss": 2.746629238128662, "eval_runtime": 68.1664, "eval_samples_per_second": 6301.127, "eval_steps_per_second": 2.054, "step": 572845 }, { "epoch": 455.12, "learning_rate": 1.0897537728355838e-05, "loss": 2.8915, "step": 573000 }, { "epoch": 455.52, "learning_rate": 1.0889594916600477e-05, "loss": 2.8936, "step": 573500 }, { "epoch": 455.92, "learning_rate": 1.0881652104845115e-05, "loss": 2.8934, "step": 574000 }, { "epoch": 456.0, "eval_loss": 2.7440550327301025, "eval_runtime": 66.5028, "eval_samples_per_second": 6458.748, "eval_steps_per_second": 2.105, "step": 574104 }, { "epoch": 456.31, "learning_rate": 1.0873709293089754e-05, "loss": 2.8905, "step": 574500 }, { "epoch": 456.71, "learning_rate": 1.0865766481334393e-05, "loss": 2.8877, "step": 575000 }, { "epoch": 457.0, "eval_loss": 2.7446813583374023, "eval_runtime": 67.2479, "eval_samples_per_second": 6387.191, "eval_steps_per_second": 2.082, "step": 575363 }, { "epoch": 457.11, "learning_rate": 1.0857823669579032e-05, "loss": 2.8905, "step": 575500 }, { "epoch": 457.51, "learning_rate": 1.0849880857823672e-05, "loss": 2.8897, "step": 576000 }, { "epoch": 457.9, "learning_rate": 1.084193804606831e-05, "loss": 2.8864, "step": 576500 }, { "epoch": 458.0, "eval_loss": 2.7404849529266357, "eval_runtime": 68.1249, "eval_samples_per_second": 6304.965, "eval_steps_per_second": 2.055, "step": 576622 }, { "epoch": 458.3, "learning_rate": 1.0833995234312948e-05, "loss": 2.8908, "step": 577000 }, { "epoch": 458.7, "learning_rate": 1.0826052422557587e-05, "loss": 2.8889, "step": 577500 }, { "epoch": 459.0, "eval_loss": 2.7393815517425537, "eval_runtime": 69.3558, "eval_samples_per_second": 6193.065, "eval_steps_per_second": 2.019, "step": 577881 }, { "epoch": 459.09, "learning_rate": 1.0818109610802225e-05, "loss": 2.8874, "step": 578000 }, { "epoch": 459.49, "learning_rate": 1.0810166799046864e-05, "loss": 2.8903, "step": 578500 }, { "epoch": 459.89, "learning_rate": 1.0802223987291503e-05, "loss": 2.8873, "step": 579000 }, { "epoch": 460.0, "eval_loss": 2.746785879135132, "eval_runtime": 68.3718, "eval_samples_per_second": 6282.193, "eval_steps_per_second": 2.048, "step": 579140 }, { "epoch": 460.29, "learning_rate": 1.079428117553614e-05, "loss": 2.8917, "step": 579500 }, { "epoch": 460.68, "learning_rate": 1.0786338363780779e-05, "loss": 2.8882, "step": 580000 }, { "epoch": 461.0, "eval_loss": 2.7414743900299072, "eval_runtime": 68.0937, "eval_samples_per_second": 6307.85, "eval_steps_per_second": 2.056, "step": 580399 }, { "epoch": 461.08, "learning_rate": 1.0778395552025419e-05, "loss": 2.8877, "step": 580500 }, { "epoch": 461.48, "learning_rate": 1.0770452740270057e-05, "loss": 2.8876, "step": 581000 }, { "epoch": 461.87, "learning_rate": 1.0762509928514695e-05, "loss": 2.8913, "step": 581500 }, { "epoch": 462.0, "eval_loss": 2.7423574924468994, "eval_runtime": 70.5348, "eval_samples_per_second": 6089.545, "eval_steps_per_second": 1.985, "step": 581658 }, { "epoch": 462.27, "learning_rate": 1.0754567116759333e-05, "loss": 2.8878, "step": 582000 }, { "epoch": 462.67, "learning_rate": 1.0746624305003973e-05, "loss": 2.8874, "step": 582500 }, { "epoch": 463.0, "eval_loss": 2.737351655960083, "eval_runtime": 67.7198, "eval_samples_per_second": 6342.682, "eval_steps_per_second": 2.067, "step": 582917 }, { "epoch": 463.07, "learning_rate": 1.073868149324861e-05, "loss": 2.8895, "step": 583000 }, { "epoch": 463.46, "learning_rate": 1.0730738681493249e-05, "loss": 2.8863, "step": 583500 }, { "epoch": 463.86, "learning_rate": 1.0722795869737887e-05, "loss": 2.886, "step": 584000 }, { "epoch": 464.0, "eval_loss": 2.7369706630706787, "eval_runtime": 67.489, "eval_samples_per_second": 6364.373, "eval_steps_per_second": 2.074, "step": 584176 }, { "epoch": 464.26, "learning_rate": 1.0714853057982527e-05, "loss": 2.8871, "step": 584500 }, { "epoch": 464.65, "learning_rate": 1.0706910246227165e-05, "loss": 2.8865, "step": 585000 }, { "epoch": 465.0, "eval_loss": 2.741349220275879, "eval_runtime": 69.8403, "eval_samples_per_second": 6150.1, "eval_steps_per_second": 2.005, "step": 585435 }, { "epoch": 465.05, "learning_rate": 1.0698967434471803e-05, "loss": 2.8875, "step": 585500 }, { "epoch": 465.45, "learning_rate": 1.0691024622716441e-05, "loss": 2.888, "step": 586000 }, { "epoch": 465.85, "learning_rate": 1.068308181096108e-05, "loss": 2.8843, "step": 586500 }, { "epoch": 466.0, "eval_loss": 2.7353320121765137, "eval_runtime": 68.1786, "eval_samples_per_second": 6300.001, "eval_steps_per_second": 2.053, "step": 586694 }, { "epoch": 466.24, "learning_rate": 1.0675138999205719e-05, "loss": 2.8873, "step": 587000 }, { "epoch": 466.64, "learning_rate": 1.0667196187450359e-05, "loss": 2.8832, "step": 587500 }, { "epoch": 467.0, "eval_loss": 2.7399463653564453, "eval_runtime": 68.0553, "eval_samples_per_second": 6311.407, "eval_steps_per_second": 2.057, "step": 587953 }, { "epoch": 467.04, "learning_rate": 1.0659253375694997e-05, "loss": 2.8855, "step": 588000 }, { "epoch": 467.43, "learning_rate": 1.0651310563939637e-05, "loss": 2.8835, "step": 588500 }, { "epoch": 467.83, "learning_rate": 1.0643367752184275e-05, "loss": 2.8858, "step": 589000 }, { "epoch": 468.0, "eval_loss": 2.7437031269073486, "eval_runtime": 67.618, "eval_samples_per_second": 6352.23, "eval_steps_per_second": 2.07, "step": 589212 }, { "epoch": 468.23, "learning_rate": 1.0635424940428913e-05, "loss": 2.8852, "step": 589500 }, { "epoch": 468.63, "learning_rate": 1.0627482128673551e-05, "loss": 2.8853, "step": 590000 }, { "epoch": 469.0, "eval_loss": 2.7328362464904785, "eval_runtime": 68.8254, "eval_samples_per_second": 6240.79, "eval_steps_per_second": 2.034, "step": 590471 }, { "epoch": 469.02, "learning_rate": 1.0619539316918191e-05, "loss": 2.886, "step": 590500 }, { "epoch": 469.42, "learning_rate": 1.0611596505162829e-05, "loss": 2.8835, "step": 591000 }, { "epoch": 469.82, "learning_rate": 1.0603653693407467e-05, "loss": 2.8846, "step": 591500 }, { "epoch": 470.0, "eval_loss": 2.737724542617798, "eval_runtime": 69.2954, "eval_samples_per_second": 6198.466, "eval_steps_per_second": 2.02, "step": 591730 }, { "epoch": 470.21, "learning_rate": 1.0595710881652105e-05, "loss": 2.8845, "step": 592000 }, { "epoch": 470.61, "learning_rate": 1.0587768069896745e-05, "loss": 2.8839, "step": 592500 }, { "epoch": 471.0, "eval_loss": 2.7345101833343506, "eval_runtime": 69.8165, "eval_samples_per_second": 6152.199, "eval_steps_per_second": 2.005, "step": 592989 }, { "epoch": 471.01, "learning_rate": 1.0579825258141383e-05, "loss": 2.8822, "step": 593000 }, { "epoch": 471.41, "learning_rate": 1.0571882446386021e-05, "loss": 2.8815, "step": 593500 }, { "epoch": 471.8, "learning_rate": 1.056393963463066e-05, "loss": 2.8837, "step": 594000 }, { "epoch": 472.0, "eval_loss": 2.735135078430176, "eval_runtime": 70.1587, "eval_samples_per_second": 6122.188, "eval_steps_per_second": 1.995, "step": 594248 }, { "epoch": 472.2, "learning_rate": 1.05559968228753e-05, "loss": 2.8831, "step": 594500 }, { "epoch": 472.6, "learning_rate": 1.0548054011119937e-05, "loss": 2.8817, "step": 595000 }, { "epoch": 472.99, "learning_rate": 1.0540111199364575e-05, "loss": 2.8846, "step": 595500 }, { "epoch": 473.0, "eval_loss": 2.7403037548065186, "eval_runtime": 69.6956, "eval_samples_per_second": 6162.873, "eval_steps_per_second": 2.009, "step": 595507 }, { "epoch": 473.39, "learning_rate": 1.0532168387609214e-05, "loss": 2.882, "step": 596000 }, { "epoch": 473.79, "learning_rate": 1.0524225575853853e-05, "loss": 2.8809, "step": 596500 }, { "epoch": 474.0, "eval_loss": 2.7303824424743652, "eval_runtime": 70.0189, "eval_samples_per_second": 6134.416, "eval_steps_per_second": 1.999, "step": 596766 }, { "epoch": 474.19, "learning_rate": 1.0516282764098491e-05, "loss": 2.8805, "step": 597000 }, { "epoch": 474.58, "learning_rate": 1.050833995234313e-05, "loss": 2.8831, "step": 597500 }, { "epoch": 474.98, "learning_rate": 1.0500397140587768e-05, "loss": 2.8819, "step": 598000 }, { "epoch": 475.0, "eval_loss": 2.7389371395111084, "eval_runtime": 67.5469, "eval_samples_per_second": 6358.918, "eval_steps_per_second": 2.073, "step": 598025 }, { "epoch": 475.38, "learning_rate": 1.0492454328832406e-05, "loss": 2.8792, "step": 598500 }, { "epoch": 475.77, "learning_rate": 1.0484511517077046e-05, "loss": 2.8815, "step": 599000 }, { "epoch": 476.0, "eval_loss": 2.7319533824920654, "eval_runtime": 67.2715, "eval_samples_per_second": 6384.951, "eval_steps_per_second": 2.081, "step": 599284 }, { "epoch": 476.17, "learning_rate": 1.0476568705321685e-05, "loss": 2.8793, "step": 599500 }, { "epoch": 476.57, "learning_rate": 1.0468625893566324e-05, "loss": 2.8818, "step": 600000 }, { "epoch": 476.97, "learning_rate": 1.0460683081810963e-05, "loss": 2.8802, "step": 600500 }, { "epoch": 477.0, "eval_loss": 2.730959892272949, "eval_runtime": 68.3844, "eval_samples_per_second": 6281.041, "eval_steps_per_second": 2.047, "step": 600543 }, { "epoch": 477.36, "learning_rate": 1.0452740270055601e-05, "loss": 2.8797, "step": 601000 }, { "epoch": 477.76, "learning_rate": 1.044479745830024e-05, "loss": 2.8823, "step": 601500 }, { "epoch": 478.0, "eval_loss": 2.7357118129730225, "eval_runtime": 70.5016, "eval_samples_per_second": 6092.415, "eval_steps_per_second": 1.986, "step": 601802 }, { "epoch": 478.16, "learning_rate": 1.0436854646544878e-05, "loss": 2.8812, "step": 602000 }, { "epoch": 478.55, "learning_rate": 1.0428911834789517e-05, "loss": 2.8805, "step": 602500 }, { "epoch": 478.95, "learning_rate": 1.0420969023034156e-05, "loss": 2.8786, "step": 603000 }, { "epoch": 479.0, "eval_loss": 2.7355384826660156, "eval_runtime": 67.7807, "eval_samples_per_second": 6336.98, "eval_steps_per_second": 2.065, "step": 603061 }, { "epoch": 479.35, "learning_rate": 1.0413026211278794e-05, "loss": 2.8803, "step": 603500 }, { "epoch": 479.75, "learning_rate": 1.0405083399523432e-05, "loss": 2.8778, "step": 604000 }, { "epoch": 480.0, "eval_loss": 2.7329230308532715, "eval_runtime": 68.476, "eval_samples_per_second": 6272.632, "eval_steps_per_second": 2.045, "step": 604320 }, { "epoch": 480.14, "learning_rate": 1.0397140587768072e-05, "loss": 2.8789, "step": 604500 }, { "epoch": 480.54, "learning_rate": 1.038919777601271e-05, "loss": 2.8788, "step": 605000 }, { "epoch": 480.94, "learning_rate": 1.0381254964257348e-05, "loss": 2.8807, "step": 605500 }, { "epoch": 481.0, "eval_loss": 2.728505849838257, "eval_runtime": 66.0982, "eval_samples_per_second": 6498.29, "eval_steps_per_second": 2.118, "step": 605579 }, { "epoch": 481.33, "learning_rate": 1.0373312152501986e-05, "loss": 2.8801, "step": 606000 }, { "epoch": 481.73, "learning_rate": 1.0365369340746626e-05, "loss": 2.8809, "step": 606500 }, { "epoch": 482.0, "eval_loss": 2.7332170009613037, "eval_runtime": 67.4579, "eval_samples_per_second": 6367.307, "eval_steps_per_second": 2.075, "step": 606838 }, { "epoch": 482.13, "learning_rate": 1.0357426528991264e-05, "loss": 2.8805, "step": 607000 }, { "epoch": 482.53, "learning_rate": 1.0349483717235902e-05, "loss": 2.8778, "step": 607500 }, { "epoch": 482.92, "learning_rate": 1.034154090548054e-05, "loss": 2.8774, "step": 608000 }, { "epoch": 483.0, "eval_loss": 2.732109308242798, "eval_runtime": 67.8254, "eval_samples_per_second": 6332.803, "eval_steps_per_second": 2.064, "step": 608097 }, { "epoch": 483.32, "learning_rate": 1.0333598093725178e-05, "loss": 2.8763, "step": 608500 }, { "epoch": 483.72, "learning_rate": 1.0325655281969818e-05, "loss": 2.8764, "step": 609000 }, { "epoch": 484.0, "eval_loss": 2.7280848026275635, "eval_runtime": 66.3583, "eval_samples_per_second": 6472.819, "eval_steps_per_second": 2.11, "step": 609356 }, { "epoch": 484.11, "learning_rate": 1.0317712470214456e-05, "loss": 2.8797, "step": 609500 }, { "epoch": 484.51, "learning_rate": 1.0309769658459094e-05, "loss": 2.8771, "step": 610000 }, { "epoch": 484.91, "learning_rate": 1.0301826846703732e-05, "loss": 2.877, "step": 610500 }, { "epoch": 485.0, "eval_loss": 2.7299296855926514, "eval_runtime": 66.5439, "eval_samples_per_second": 6454.765, "eval_steps_per_second": 2.104, "step": 610615 }, { "epoch": 485.31, "learning_rate": 1.0293884034948372e-05, "loss": 2.8787, "step": 611000 }, { "epoch": 485.7, "learning_rate": 1.0285941223193012e-05, "loss": 2.8762, "step": 611500 }, { "epoch": 486.0, "eval_loss": 2.732123374938965, "eval_runtime": 68.3413, "eval_samples_per_second": 6284.997, "eval_steps_per_second": 2.049, "step": 611874 }, { "epoch": 486.1, "learning_rate": 1.027799841143765e-05, "loss": 2.8763, "step": 612000 }, { "epoch": 486.5, "learning_rate": 1.027005559968229e-05, "loss": 2.8752, "step": 612500 }, { "epoch": 486.89, "learning_rate": 1.0262112787926928e-05, "loss": 2.8788, "step": 613000 }, { "epoch": 487.0, "eval_loss": 2.7287232875823975, "eval_runtime": 66.9916, "eval_samples_per_second": 6411.624, "eval_steps_per_second": 2.09, "step": 613133 }, { "epoch": 487.29, "learning_rate": 1.0254169976171566e-05, "loss": 2.8756, "step": 613500 }, { "epoch": 487.69, "learning_rate": 1.0246227164416204e-05, "loss": 2.8767, "step": 614000 }, { "epoch": 488.0, "eval_loss": 2.7332873344421387, "eval_runtime": 67.5497, "eval_samples_per_second": 6358.652, "eval_steps_per_second": 2.073, "step": 614392 }, { "epoch": 488.09, "learning_rate": 1.0238284352660844e-05, "loss": 2.879, "step": 614500 }, { "epoch": 488.48, "learning_rate": 1.0230341540905482e-05, "loss": 2.8762, "step": 615000 }, { "epoch": 488.88, "learning_rate": 1.022239872915012e-05, "loss": 2.8739, "step": 615500 }, { "epoch": 489.0, "eval_loss": 2.7251839637756348, "eval_runtime": 67.8784, "eval_samples_per_second": 6327.857, "eval_steps_per_second": 2.063, "step": 615651 }, { "epoch": 489.28, "learning_rate": 1.0214455917394758e-05, "loss": 2.877, "step": 616000 }, { "epoch": 489.67, "learning_rate": 1.0206513105639398e-05, "loss": 2.876, "step": 616500 }, { "epoch": 490.0, "eval_loss": 2.7319650650024414, "eval_runtime": 66.7454, "eval_samples_per_second": 6435.271, "eval_steps_per_second": 2.098, "step": 616910 }, { "epoch": 490.07, "learning_rate": 1.0198570293884036e-05, "loss": 2.8744, "step": 617000 }, { "epoch": 490.47, "learning_rate": 1.0190627482128674e-05, "loss": 2.8746, "step": 617500 }, { "epoch": 490.87, "learning_rate": 1.0182684670373312e-05, "loss": 2.8758, "step": 618000 }, { "epoch": 491.0, "eval_loss": 2.728564739227295, "eval_runtime": 67.2257, "eval_samples_per_second": 6389.3, "eval_steps_per_second": 2.083, "step": 618169 }, { "epoch": 491.26, "learning_rate": 1.0174741858617952e-05, "loss": 2.8754, "step": 618500 }, { "epoch": 491.66, "learning_rate": 1.016679904686259e-05, "loss": 2.8751, "step": 619000 }, { "epoch": 492.0, "eval_loss": 2.7330923080444336, "eval_runtime": 66.3222, "eval_samples_per_second": 6476.334, "eval_steps_per_second": 2.111, "step": 619428 }, { "epoch": 492.06, "learning_rate": 1.0158856235107228e-05, "loss": 2.8735, "step": 619500 }, { "epoch": 492.45, "learning_rate": 1.0150913423351867e-05, "loss": 2.8755, "step": 620000 } ], "max_steps": 1259000, "num_train_epochs": 1000, "total_flos": 2.9162740254912e+18, "trial_name": null, "trial_params": null }