{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.722424669879707, "eval_steps": 10000, "global_step": 380000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.1, "learning_rate": 9.83937751004016e-06, "loss": 1.4799, "step": 10000 }, { "epoch": 0.1, "eval_loss": 1.0304539203643799, "eval_runtime": 69.6877, "eval_samples_per_second": 1434.973, "eval_steps_per_second": 14.952, "step": 10000 }, { "epoch": 0.2, "learning_rate": 9.638654618473896e-06, "loss": 0.9736, "step": 20000 }, { "epoch": 0.2, "eval_loss": 0.9039866328239441, "eval_runtime": 65.2654, "eval_samples_per_second": 1532.205, "eval_steps_per_second": 15.966, "step": 20000 }, { "epoch": 0.29, "learning_rate": 9.437951807228917e-06, "loss": 0.8922, "step": 30000 }, { "epoch": 0.29, "eval_loss": 0.8560834527015686, "eval_runtime": 64.4642, "eval_samples_per_second": 1551.248, "eval_steps_per_second": 16.164, "step": 30000 }, { "epoch": 0.39, "learning_rate": 9.237228915662652e-06, "loss": 0.8531, "step": 40000 }, { "epoch": 0.39, "eval_loss": 0.8226799368858337, "eval_runtime": 64.3868, "eval_samples_per_second": 1553.114, "eval_steps_per_second": 16.183, "step": 40000 }, { "epoch": 0.49, "learning_rate": 9.036526104417672e-06, "loss": 0.829, "step": 50000 }, { "epoch": 0.49, "eval_loss": 0.8042359948158264, "eval_runtime": 64.3396, "eval_samples_per_second": 1554.254, "eval_steps_per_second": 16.195, "step": 50000 }, { "epoch": 0.59, "learning_rate": 8.835823293172691e-06, "loss": 0.811, "step": 60000 }, { "epoch": 0.59, "eval_loss": 0.7950388789176941, "eval_runtime": 64.5291, "eval_samples_per_second": 1549.687, "eval_steps_per_second": 16.148, "step": 60000 }, { "epoch": 0.69, "learning_rate": 8.635120481927711e-06, "loss": 0.7976, "step": 70000 }, { "epoch": 0.69, "eval_loss": 0.7777369022369385, "eval_runtime": 64.862, "eval_samples_per_second": 1541.734, "eval_steps_per_second": 16.065, "step": 70000 }, { "epoch": 0.78, "learning_rate": 8.434417670682732e-06, "loss": 0.7872, "step": 80000 }, { "epoch": 0.78, "eval_loss": 0.7702258825302124, "eval_runtime": 64.4619, "eval_samples_per_second": 1551.303, "eval_steps_per_second": 16.165, "step": 80000 }, { "epoch": 0.88, "learning_rate": 8.233714859437752e-06, "loss": 0.778, "step": 90000 }, { "epoch": 0.88, "eval_loss": 0.760698139667511, "eval_runtime": 64.4008, "eval_samples_per_second": 1552.775, "eval_steps_per_second": 16.18, "step": 90000 }, { "epoch": 0.98, "learning_rate": 8.033012048192772e-06, "loss": 0.7695, "step": 100000 }, { "epoch": 0.98, "eval_loss": 0.7517786622047424, "eval_runtime": 64.56, "eval_samples_per_second": 1548.946, "eval_steps_per_second": 16.14, "step": 100000 }, { "epoch": 1.08, "learning_rate": 7.832309236947791e-06, "loss": 0.7638, "step": 110000 }, { "epoch": 1.08, "eval_loss": 0.7468145489692688, "eval_runtime": 64.9838, "eval_samples_per_second": 1538.846, "eval_steps_per_second": 16.035, "step": 110000 }, { "epoch": 1.18, "learning_rate": 7.631606425702813e-06, "loss": 0.7573, "step": 120000 }, { "epoch": 1.18, "eval_loss": 0.739628255367279, "eval_runtime": 64.2595, "eval_samples_per_second": 1556.191, "eval_steps_per_second": 16.216, "step": 120000 }, { "epoch": 1.27, "learning_rate": 7.4309036144578315e-06, "loss": 0.7512, "step": 130000 }, { "epoch": 1.27, "eval_loss": 0.7346537709236145, "eval_runtime": 64.911, "eval_samples_per_second": 1540.57, "eval_steps_per_second": 16.053, "step": 130000 }, { "epoch": 1.37, "learning_rate": 7.2301807228915665e-06, "loss": 0.7468, "step": 140000 }, { "epoch": 1.37, "eval_loss": 0.7300976514816284, "eval_runtime": 64.8445, "eval_samples_per_second": 1542.15, "eval_steps_per_second": 16.069, "step": 140000 }, { "epoch": 1.47, "learning_rate": 7.029477911646587e-06, "loss": 0.7418, "step": 150000 }, { "epoch": 1.47, "eval_loss": 0.7252103686332703, "eval_runtime": 64.5503, "eval_samples_per_second": 1549.179, "eval_steps_per_second": 16.142, "step": 150000 }, { "epoch": 1.57, "learning_rate": 6.828775100401607e-06, "loss": 0.7379, "step": 160000 }, { "epoch": 1.57, "eval_loss": 0.7203709483146667, "eval_runtime": 64.4763, "eval_samples_per_second": 1550.958, "eval_steps_per_second": 16.161, "step": 160000 }, { "epoch": 1.67, "learning_rate": 6.628072289156627e-06, "loss": 0.7333, "step": 170000 }, { "epoch": 1.67, "eval_loss": 0.7167079448699951, "eval_runtime": 64.7852, "eval_samples_per_second": 1543.562, "eval_steps_per_second": 16.084, "step": 170000 }, { "epoch": 1.76, "learning_rate": 6.427369477911647e-06, "loss": 0.7298, "step": 180000 }, { "epoch": 1.76, "eval_loss": 0.7129059433937073, "eval_runtime": 64.609, "eval_samples_per_second": 1547.772, "eval_steps_per_second": 16.128, "step": 180000 }, { "epoch": 1.86, "learning_rate": 6.2266666666666675e-06, "loss": 0.7258, "step": 190000 }, { "epoch": 1.86, "eval_loss": 0.7089965343475342, "eval_runtime": 64.6825, "eval_samples_per_second": 1546.013, "eval_steps_per_second": 16.109, "step": 190000 }, { "epoch": 1.96, "learning_rate": 6.025963855421687e-06, "loss": 0.7229, "step": 200000 }, { "epoch": 1.96, "eval_loss": 0.7047411799430847, "eval_runtime": 64.3501, "eval_samples_per_second": 1553.999, "eval_steps_per_second": 16.193, "step": 200000 }, { "epoch": 2.06, "learning_rate": 5.825261044176708e-06, "loss": 0.7194, "step": 210000 }, { "epoch": 2.06, "eval_loss": 0.7026636004447937, "eval_runtime": 64.4785, "eval_samples_per_second": 1550.905, "eval_steps_per_second": 16.16, "step": 210000 }, { "epoch": 2.16, "learning_rate": 5.624558232931727e-06, "loss": 0.7171, "step": 220000 }, { "epoch": 2.16, "eval_loss": 0.702177107334137, "eval_runtime": 64.5209, "eval_samples_per_second": 1549.886, "eval_steps_per_second": 16.15, "step": 220000 }, { "epoch": 2.25, "learning_rate": 5.423855421686748e-06, "loss": 0.7138, "step": 230000 }, { "epoch": 2.25, "eval_loss": 0.6978575587272644, "eval_runtime": 64.573, "eval_samples_per_second": 1548.634, "eval_steps_per_second": 16.137, "step": 230000 }, { "epoch": 2.35, "learning_rate": 5.2231526104417676e-06, "loss": 0.7113, "step": 240000 }, { "epoch": 2.35, "eval_loss": 0.6939824819564819, "eval_runtime": 64.7414, "eval_samples_per_second": 1544.606, "eval_steps_per_second": 16.095, "step": 240000 }, { "epoch": 2.45, "learning_rate": 5.022449799196788e-06, "loss": 0.7087, "step": 250000 }, { "epoch": 2.45, "eval_loss": 0.6922757029533386, "eval_runtime": 64.5094, "eval_samples_per_second": 1550.162, "eval_steps_per_second": 16.153, "step": 250000 }, { "epoch": 2.55, "learning_rate": 4.821746987951808e-06, "loss": 0.7071, "step": 260000 }, { "epoch": 2.55, "eval_loss": 0.6915081739425659, "eval_runtime": 65.0605, "eval_samples_per_second": 1537.031, "eval_steps_per_second": 16.016, "step": 260000 }, { "epoch": 2.64, "learning_rate": 4.6210441767068274e-06, "loss": 0.7047, "step": 270000 }, { "epoch": 2.64, "eval_loss": 0.6870027184486389, "eval_runtime": 64.7199, "eval_samples_per_second": 1545.119, "eval_steps_per_second": 16.1, "step": 270000 }, { "epoch": 2.74, "learning_rate": 4.420341365461848e-06, "loss": 0.7027, "step": 280000 }, { "epoch": 2.74, "eval_loss": 0.6844470500946045, "eval_runtime": 64.906, "eval_samples_per_second": 1540.69, "eval_steps_per_second": 16.054, "step": 280000 }, { "epoch": 2.84, "learning_rate": 4.219638554216868e-06, "loss": 0.6999, "step": 290000 }, { "epoch": 2.84, "eval_loss": 0.6833365559577942, "eval_runtime": 64.5739, "eval_samples_per_second": 1548.614, "eval_steps_per_second": 16.137, "step": 290000 }, { "epoch": 2.94, "learning_rate": 4.018935742971888e-06, "loss": 0.6992, "step": 300000 }, { "epoch": 2.94, "eval_loss": 0.6834575533866882, "eval_runtime": 64.6706, "eval_samples_per_second": 1546.298, "eval_steps_per_second": 16.112, "step": 300000 }, { "epoch": 3.04, "learning_rate": 3.818253012048193e-06, "loss": 0.6975, "step": 310000 }, { "epoch": 3.04, "eval_loss": 0.6793897151947021, "eval_runtime": 64.8495, "eval_samples_per_second": 1542.031, "eval_steps_per_second": 16.068, "step": 310000 }, { "epoch": 3.13, "learning_rate": 3.617530120481928e-06, "loss": 0.6947, "step": 320000 }, { "epoch": 3.13, "eval_loss": 0.6794592142105103, "eval_runtime": 64.6554, "eval_samples_per_second": 1546.661, "eval_steps_per_second": 16.116, "step": 320000 }, { "epoch": 3.23, "learning_rate": 3.416827309236948e-06, "loss": 0.6947, "step": 330000 }, { "epoch": 3.23, "eval_loss": 0.6781629920005798, "eval_runtime": 65.93, "eval_samples_per_second": 1516.76, "eval_steps_per_second": 15.805, "step": 330000 }, { "epoch": 3.33, "learning_rate": 3.216124497991968e-06, "loss": 0.6932, "step": 340000 }, { "epoch": 3.33, "eval_loss": 0.6771513819694519, "eval_runtime": 64.7971, "eval_samples_per_second": 1543.279, "eval_steps_per_second": 16.081, "step": 340000 }, { "epoch": 3.43, "learning_rate": 3.015401606425703e-06, "loss": 0.6914, "step": 350000 }, { "epoch": 3.43, "eval_loss": 0.6757428050041199, "eval_runtime": 64.8856, "eval_samples_per_second": 1541.173, "eval_steps_per_second": 16.059, "step": 350000 }, { "epoch": 3.53, "learning_rate": 2.8146987951807233e-06, "loss": 0.6907, "step": 360000 }, { "epoch": 3.53, "eval_loss": 0.6730121374130249, "eval_runtime": 65.1098, "eval_samples_per_second": 1535.867, "eval_steps_per_second": 16.004, "step": 360000 }, { "epoch": 3.62, "learning_rate": 2.6139959839357434e-06, "loss": 0.689, "step": 370000 }, { "epoch": 3.62, "eval_loss": 0.6724720597267151, "eval_runtime": 65.1981, "eval_samples_per_second": 1533.787, "eval_steps_per_second": 15.982, "step": 370000 }, { "epoch": 3.72, "learning_rate": 2.413293172690763e-06, "loss": 0.6885, "step": 380000 }, { "epoch": 3.72, "eval_loss": 0.6744215488433838, "eval_runtime": 64.9859, "eval_samples_per_second": 1538.796, "eval_steps_per_second": 16.034, "step": 380000 } ], "logging_steps": 10000, "max_steps": 500000, "num_train_epochs": 5, "save_steps": 10000, "total_flos": 6.301004166565632e+17, "trial_name": null, "trial_params": null }