{ "best_metric": null, "best_model_checkpoint": null, "epoch": 8.0, "global_step": 76416, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05, "learning_rate": 2.9803706030150755e-05, "loss": 1.2294, "step": 500 }, { "epoch": 0.1, "learning_rate": 2.960741206030151e-05, "loss": 0.9625, "step": 1000 }, { "epoch": 0.16, "learning_rate": 2.9411118090452263e-05, "loss": 0.9145, "step": 1500 }, { "epoch": 0.21, "learning_rate": 2.9214824120603013e-05, "loss": 0.8595, "step": 2000 }, { "epoch": 0.26, "learning_rate": 2.901853015075377e-05, "loss": 0.8622, "step": 2500 }, { "epoch": 0.31, "learning_rate": 2.8822236180904525e-05, "loss": 0.8504, "step": 3000 }, { "epoch": 0.37, "learning_rate": 2.8625942211055275e-05, "loss": 0.8308, "step": 3500 }, { "epoch": 0.42, "learning_rate": 2.842964824120603e-05, "loss": 0.8292, "step": 4000 }, { "epoch": 0.47, "learning_rate": 2.8233354271356787e-05, "loss": 0.8085, "step": 4500 }, { "epoch": 0.52, "learning_rate": 2.8037060301507537e-05, "loss": 0.7914, "step": 5000 }, { "epoch": 0.58, "learning_rate": 2.784076633165829e-05, "loss": 0.8014, "step": 5500 }, { "epoch": 0.63, "learning_rate": 2.7644472361809045e-05, "loss": 0.7876, "step": 6000 }, { "epoch": 0.68, "learning_rate": 2.74481783919598e-05, "loss": 0.7936, "step": 6500 }, { "epoch": 0.73, "learning_rate": 2.7251884422110553e-05, "loss": 0.7975, "step": 7000 }, { "epoch": 0.79, "learning_rate": 2.7055590452261307e-05, "loss": 0.7748, "step": 7500 }, { "epoch": 0.84, "learning_rate": 2.685929648241206e-05, "loss": 0.7804, "step": 8000 }, { "epoch": 0.89, "learning_rate": 2.6663002512562815e-05, "loss": 0.773, "step": 8500 }, { "epoch": 0.94, "learning_rate": 2.646670854271357e-05, "loss": 0.7652, "step": 9000 }, { "epoch": 0.99, "learning_rate": 2.6270414572864323e-05, "loss": 0.7727, "step": 9500 }, { "epoch": 1.0, "eval_loss": 0.8350390195846558, "eval_runtime": 8.4552, "eval_samples_per_second": 1312.323, "eval_steps_per_second": 41.04, "step": 9552 }, { "epoch": 1.05, "learning_rate": 2.6074120603015074e-05, "loss": 0.7397, "step": 10000 }, { "epoch": 1.1, "learning_rate": 2.587782663316583e-05, "loss": 0.7335, "step": 10500 }, { "epoch": 1.15, "learning_rate": 2.5681532663316585e-05, "loss": 0.7095, "step": 11000 }, { "epoch": 1.2, "learning_rate": 2.5485238693467336e-05, "loss": 0.7407, "step": 11500 }, { "epoch": 1.26, "learning_rate": 2.528894472361809e-05, "loss": 0.7235, "step": 12000 }, { "epoch": 1.31, "learning_rate": 2.5092650753768844e-05, "loss": 0.709, "step": 12500 }, { "epoch": 1.36, "learning_rate": 2.4896356783919598e-05, "loss": 0.7161, "step": 13000 }, { "epoch": 1.41, "learning_rate": 2.4700062814070352e-05, "loss": 0.7278, "step": 13500 }, { "epoch": 1.47, "learning_rate": 2.4503768844221106e-05, "loss": 0.74, "step": 14000 }, { "epoch": 1.52, "learning_rate": 2.430747487437186e-05, "loss": 0.7323, "step": 14500 }, { "epoch": 1.57, "learning_rate": 2.4111180904522614e-05, "loss": 0.7298, "step": 15000 }, { "epoch": 1.62, "learning_rate": 2.3914886934673368e-05, "loss": 0.7394, "step": 15500 }, { "epoch": 1.68, "learning_rate": 2.3718592964824122e-05, "loss": 0.7393, "step": 16000 }, { "epoch": 1.73, "learning_rate": 2.3522298994974873e-05, "loss": 0.715, "step": 16500 }, { "epoch": 1.78, "learning_rate": 2.332600502512563e-05, "loss": 0.7139, "step": 17000 }, { "epoch": 1.83, "learning_rate": 2.3129711055276384e-05, "loss": 0.7215, "step": 17500 }, { "epoch": 1.88, "learning_rate": 2.2933417085427135e-05, "loss": 0.7122, "step": 18000 }, { "epoch": 1.94, "learning_rate": 2.273712311557789e-05, "loss": 0.7119, "step": 18500 }, { "epoch": 1.99, "learning_rate": 2.2540829145728646e-05, "loss": 0.7113, "step": 19000 }, { "epoch": 2.0, "eval_loss": 0.8123675584793091, "eval_runtime": 8.2517, "eval_samples_per_second": 1344.695, "eval_steps_per_second": 42.052, "step": 19104 }, { "epoch": 2.04, "learning_rate": 2.2344535175879397e-05, "loss": 0.6898, "step": 19500 }, { "epoch": 2.09, "learning_rate": 2.214824120603015e-05, "loss": 0.6819, "step": 20000 }, { "epoch": 2.15, "learning_rate": 2.1951947236180905e-05, "loss": 0.6797, "step": 20500 }, { "epoch": 2.2, "learning_rate": 2.175565326633166e-05, "loss": 0.6808, "step": 21000 }, { "epoch": 2.25, "learning_rate": 2.1559359296482413e-05, "loss": 0.6949, "step": 21500 }, { "epoch": 2.3, "learning_rate": 2.1363065326633167e-05, "loss": 0.689, "step": 22000 }, { "epoch": 2.36, "learning_rate": 2.116677135678392e-05, "loss": 0.6993, "step": 22500 }, { "epoch": 2.41, "learning_rate": 2.0970477386934675e-05, "loss": 0.6795, "step": 23000 }, { "epoch": 2.46, "learning_rate": 2.077418341708543e-05, "loss": 0.6972, "step": 23500 }, { "epoch": 2.51, "learning_rate": 2.0577889447236183e-05, "loss": 0.6831, "step": 24000 }, { "epoch": 2.56, "learning_rate": 2.0381595477386933e-05, "loss": 0.678, "step": 24500 }, { "epoch": 2.62, "learning_rate": 2.0185301507537687e-05, "loss": 0.6871, "step": 25000 }, { "epoch": 2.67, "learning_rate": 1.9989007537688445e-05, "loss": 0.6793, "step": 25500 }, { "epoch": 2.72, "learning_rate": 1.9792713567839195e-05, "loss": 0.6808, "step": 26000 }, { "epoch": 2.77, "learning_rate": 1.959641959798995e-05, "loss": 0.6894, "step": 26500 }, { "epoch": 2.83, "learning_rate": 1.9400125628140703e-05, "loss": 0.6876, "step": 27000 }, { "epoch": 2.88, "learning_rate": 1.9203831658291457e-05, "loss": 0.6919, "step": 27500 }, { "epoch": 2.93, "learning_rate": 1.900753768844221e-05, "loss": 0.6856, "step": 28000 }, { "epoch": 2.98, "learning_rate": 1.8811243718592965e-05, "loss": 0.6708, "step": 28500 }, { "epoch": 3.0, "eval_loss": 0.7998350262641907, "eval_runtime": 8.2332, "eval_samples_per_second": 1347.713, "eval_steps_per_second": 42.146, "step": 28656 }, { "epoch": 3.04, "learning_rate": 1.861494974874372e-05, "loss": 0.6758, "step": 29000 }, { "epoch": 3.09, "learning_rate": 1.8418655778894473e-05, "loss": 0.6638, "step": 29500 }, { "epoch": 3.14, "learning_rate": 1.8222361809045227e-05, "loss": 0.654, "step": 30000 }, { "epoch": 3.19, "learning_rate": 1.802606783919598e-05, "loss": 0.6792, "step": 30500 }, { "epoch": 3.25, "learning_rate": 1.7829773869346732e-05, "loss": 0.6999, "step": 31000 }, { "epoch": 3.3, "learning_rate": 1.763347989949749e-05, "loss": 0.6834, "step": 31500 }, { "epoch": 3.35, "learning_rate": 1.7437185929648243e-05, "loss": 0.647, "step": 32000 }, { "epoch": 3.4, "learning_rate": 1.7240891959798994e-05, "loss": 0.662, "step": 32500 }, { "epoch": 3.45, "learning_rate": 1.7044597989949748e-05, "loss": 0.6674, "step": 33000 }, { "epoch": 3.51, "learning_rate": 1.6848304020100505e-05, "loss": 0.6409, "step": 33500 }, { "epoch": 3.56, "learning_rate": 1.6652010050251256e-05, "loss": 0.6766, "step": 34000 }, { "epoch": 3.61, "learning_rate": 1.645571608040201e-05, "loss": 0.6658, "step": 34500 }, { "epoch": 3.66, "learning_rate": 1.6259422110552764e-05, "loss": 0.6765, "step": 35000 }, { "epoch": 3.72, "learning_rate": 1.6063128140703518e-05, "loss": 0.6478, "step": 35500 }, { "epoch": 3.77, "learning_rate": 1.5866834170854272e-05, "loss": 0.6604, "step": 36000 }, { "epoch": 3.82, "learning_rate": 1.5670540201005026e-05, "loss": 0.6398, "step": 36500 }, { "epoch": 3.87, "learning_rate": 1.547424623115578e-05, "loss": 0.6554, "step": 37000 }, { "epoch": 3.93, "learning_rate": 1.527795226130653e-05, "loss": 0.6442, "step": 37500 }, { "epoch": 3.98, "learning_rate": 1.5081658291457286e-05, "loss": 0.6552, "step": 38000 }, { "epoch": 4.0, "eval_loss": 0.793069064617157, "eval_runtime": 8.0925, "eval_samples_per_second": 1371.141, "eval_steps_per_second": 42.879, "step": 38208 }, { "epoch": 4.03, "learning_rate": 1.488536432160804e-05, "loss": 0.6526, "step": 38500 }, { "epoch": 4.08, "learning_rate": 1.4689070351758794e-05, "loss": 0.656, "step": 39000 }, { "epoch": 4.14, "learning_rate": 1.4492776381909548e-05, "loss": 0.6499, "step": 39500 }, { "epoch": 4.19, "learning_rate": 1.4296482412060302e-05, "loss": 0.6319, "step": 40000 }, { "epoch": 4.24, "learning_rate": 1.4100188442211055e-05, "loss": 0.6511, "step": 40500 }, { "epoch": 4.29, "learning_rate": 1.390389447236181e-05, "loss": 0.6461, "step": 41000 }, { "epoch": 4.34, "learning_rate": 1.3707600502512563e-05, "loss": 0.6472, "step": 41500 }, { "epoch": 4.4, "learning_rate": 1.3511306532663317e-05, "loss": 0.6334, "step": 42000 }, { "epoch": 4.45, "learning_rate": 1.331501256281407e-05, "loss": 0.6302, "step": 42500 }, { "epoch": 4.5, "learning_rate": 1.3118718592964825e-05, "loss": 0.6584, "step": 43000 }, { "epoch": 4.55, "learning_rate": 1.2922424623115579e-05, "loss": 0.635, "step": 43500 }, { "epoch": 4.61, "learning_rate": 1.2726130653266331e-05, "loss": 0.6305, "step": 44000 }, { "epoch": 4.66, "learning_rate": 1.2529836683417085e-05, "loss": 0.6448, "step": 44500 }, { "epoch": 4.71, "learning_rate": 1.233354271356784e-05, "loss": 0.6546, "step": 45000 }, { "epoch": 4.76, "learning_rate": 1.2137248743718593e-05, "loss": 0.6426, "step": 45500 }, { "epoch": 4.82, "learning_rate": 1.1940954773869347e-05, "loss": 0.6405, "step": 46000 }, { "epoch": 4.87, "learning_rate": 1.1744660804020101e-05, "loss": 0.6207, "step": 46500 }, { "epoch": 4.92, "learning_rate": 1.1548366834170855e-05, "loss": 0.6319, "step": 47000 }, { "epoch": 4.97, "learning_rate": 1.1352072864321609e-05, "loss": 0.6419, "step": 47500 }, { "epoch": 5.0, "eval_loss": 0.7893060445785522, "eval_runtime": 8.4012, "eval_samples_per_second": 1320.763, "eval_steps_per_second": 41.304, "step": 47760 }, { "epoch": 5.03, "learning_rate": 1.1155778894472361e-05, "loss": 0.6382, "step": 48000 }, { "epoch": 5.08, "learning_rate": 1.0959484924623115e-05, "loss": 0.6264, "step": 48500 }, { "epoch": 5.13, "learning_rate": 1.076319095477387e-05, "loss": 0.627, "step": 49000 }, { "epoch": 5.18, "learning_rate": 1.0566896984924623e-05, "loss": 0.6284, "step": 49500 }, { "epoch": 5.23, "learning_rate": 1.0370603015075377e-05, "loss": 0.6256, "step": 50000 }, { "epoch": 5.29, "learning_rate": 1.0174309045226131e-05, "loss": 0.6423, "step": 50500 }, { "epoch": 5.34, "learning_rate": 9.978015075376884e-06, "loss": 0.634, "step": 51000 }, { "epoch": 5.39, "learning_rate": 9.78172110552764e-06, "loss": 0.6393, "step": 51500 }, { "epoch": 5.44, "learning_rate": 9.585427135678392e-06, "loss": 0.6259, "step": 52000 }, { "epoch": 5.5, "learning_rate": 9.389133165829146e-06, "loss": 0.628, "step": 52500 }, { "epoch": 5.55, "learning_rate": 9.1928391959799e-06, "loss": 0.6337, "step": 53000 }, { "epoch": 5.6, "learning_rate": 8.996545226130654e-06, "loss": 0.6128, "step": 53500 }, { "epoch": 5.65, "learning_rate": 8.800251256281408e-06, "loss": 0.6276, "step": 54000 }, { "epoch": 5.71, "learning_rate": 8.60395728643216e-06, "loss": 0.629, "step": 54500 }, { "epoch": 5.76, "learning_rate": 8.407663316582914e-06, "loss": 0.6261, "step": 55000 }, { "epoch": 5.81, "learning_rate": 8.21136934673367e-06, "loss": 0.6137, "step": 55500 }, { "epoch": 5.86, "learning_rate": 8.015075376884422e-06, "loss": 0.6353, "step": 56000 }, { "epoch": 5.91, "learning_rate": 7.818781407035176e-06, "loss": 0.6239, "step": 56500 }, { "epoch": 5.97, "learning_rate": 7.622487437185929e-06, "loss": 0.618, "step": 57000 }, { "epoch": 6.0, "eval_loss": 0.7882150411605835, "eval_runtime": 8.1734, "eval_samples_per_second": 1357.582, "eval_steps_per_second": 42.455, "step": 57312 }, { "epoch": 6.02, "learning_rate": 7.426193467336683e-06, "loss": 0.6427, "step": 57500 }, { "epoch": 6.07, "learning_rate": 7.229899497487438e-06, "loss": 0.6283, "step": 58000 }, { "epoch": 6.12, "learning_rate": 7.033605527638191e-06, "loss": 0.6277, "step": 58500 }, { "epoch": 6.18, "learning_rate": 6.837311557788945e-06, "loss": 0.6131, "step": 59000 }, { "epoch": 6.23, "learning_rate": 6.641017587939698e-06, "loss": 0.6086, "step": 59500 }, { "epoch": 6.28, "learning_rate": 6.444723618090452e-06, "loss": 0.6154, "step": 60000 }, { "epoch": 6.33, "learning_rate": 6.248429648241206e-06, "loss": 0.6184, "step": 60500 }, { "epoch": 6.39, "learning_rate": 6.0521356783919595e-06, "loss": 0.6139, "step": 61000 }, { "epoch": 6.44, "learning_rate": 5.8558417085427135e-06, "loss": 0.6148, "step": 61500 }, { "epoch": 6.49, "learning_rate": 5.6595477386934675e-06, "loss": 0.6256, "step": 62000 }, { "epoch": 6.54, "learning_rate": 5.4632537688442215e-06, "loss": 0.6135, "step": 62500 }, { "epoch": 6.6, "learning_rate": 5.266959798994975e-06, "loss": 0.6274, "step": 63000 }, { "epoch": 6.65, "learning_rate": 5.070665829145729e-06, "loss": 0.623, "step": 63500 }, { "epoch": 6.7, "learning_rate": 4.874371859296483e-06, "loss": 0.6254, "step": 64000 }, { "epoch": 6.75, "learning_rate": 4.678077889447237e-06, "loss": 0.6233, "step": 64500 }, { "epoch": 6.8, "learning_rate": 4.48178391959799e-06, "loss": 0.6124, "step": 65000 }, { "epoch": 6.86, "learning_rate": 4.285489949748744e-06, "loss": 0.6108, "step": 65500 }, { "epoch": 6.91, "learning_rate": 4.089195979899497e-06, "loss": 0.6182, "step": 66000 }, { "epoch": 6.96, "learning_rate": 3.892902010050252e-06, "loss": 0.5991, "step": 66500 }, { "epoch": 7.0, "eval_loss": 0.7896326184272766, "eval_runtime": 8.35, "eval_samples_per_second": 1328.855, "eval_steps_per_second": 41.557, "step": 66864 }, { "epoch": 7.01, "learning_rate": 3.696608040201005e-06, "loss": 0.6139, "step": 67000 }, { "epoch": 7.07, "learning_rate": 3.5003140703517586e-06, "loss": 0.6092, "step": 67500 }, { "epoch": 7.12, "learning_rate": 3.3040201005025126e-06, "loss": 0.6021, "step": 68000 }, { "epoch": 7.17, "learning_rate": 3.107726130653266e-06, "loss": 0.6138, "step": 68500 }, { "epoch": 7.22, "learning_rate": 2.91143216080402e-06, "loss": 0.6279, "step": 69000 }, { "epoch": 7.28, "learning_rate": 2.7151381909547737e-06, "loss": 0.6086, "step": 69500 }, { "epoch": 7.33, "learning_rate": 2.5188442211055277e-06, "loss": 0.6108, "step": 70000 }, { "epoch": 7.38, "learning_rate": 2.3225502512562813e-06, "loss": 0.5933, "step": 70500 }, { "epoch": 7.43, "learning_rate": 2.1262562814070353e-06, "loss": 0.6089, "step": 71000 }, { "epoch": 7.49, "learning_rate": 1.929962311557789e-06, "loss": 0.6086, "step": 71500 }, { "epoch": 7.54, "learning_rate": 1.7336683417085427e-06, "loss": 0.622, "step": 72000 }, { "epoch": 7.59, "learning_rate": 1.5373743718592965e-06, "loss": 0.6041, "step": 72500 }, { "epoch": 7.64, "learning_rate": 1.3410804020100503e-06, "loss": 0.6146, "step": 73000 }, { "epoch": 7.69, "learning_rate": 1.144786432160804e-06, "loss": 0.5999, "step": 73500 }, { "epoch": 7.75, "learning_rate": 9.484924623115579e-07, "loss": 0.6094, "step": 74000 }, { "epoch": 7.8, "learning_rate": 7.521984924623115e-07, "loss": 0.6137, "step": 74500 }, { "epoch": 7.85, "learning_rate": 5.559045226130653e-07, "loss": 0.6065, "step": 75000 }, { "epoch": 7.9, "learning_rate": 3.5961055276381907e-07, "loss": 0.6149, "step": 75500 }, { "epoch": 7.96, "learning_rate": 1.6331658291457286e-07, "loss": 0.6159, "step": 76000 }, { "epoch": 8.0, "eval_loss": 0.7893399000167847, "eval_runtime": 8.9446, "eval_samples_per_second": 1240.522, "eval_steps_per_second": 38.794, "step": 76416 } ], "max_steps": 76416, "num_train_epochs": 8, "total_flos": 2.326706308251648e+16, "trial_name": null, "trial_params": null }