{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.997545409916544, "eval_steps": 500, "global_step": 127, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "learning_rate": "0.0000e+00", "loss": 3.1086, "slid_loss": 3.1086, "step": 1, "time": 93.96 }, { "epoch": 0.02, "learning_rate": "5.0000e-06", "loss": 2.8339, "slid_loss": 2.9713, "step": 2, "time": 84.7 }, { "epoch": 0.02, "learning_rate": "5.0000e-06", "loss": 2.8656, "slid_loss": 2.936, "step": 3, "time": 79.97 }, { "epoch": 0.03, "learning_rate": "5.0000e-06", "loss": 2.5524, "slid_loss": 2.8401, "step": 4, "time": 76.7 }, { "epoch": 0.04, "learning_rate": "5.0000e-06", "loss": 2.5858, "slid_loss": 2.7892, "step": 5, "time": 81.12 }, { "epoch": 0.05, "learning_rate": "5.0000e-06", "loss": 2.0771, "slid_loss": 2.6706, "step": 6, "time": 78.35 }, { "epoch": 0.05, "learning_rate": "5.0000e-06", "loss": 2.0975, "slid_loss": 2.5887, "step": 7, "time": 81.78 }, { "epoch": 0.06, "learning_rate": "5.0000e-06", "loss": 1.9181, "slid_loss": 2.5049, "step": 8, "time": 81.35 }, { "epoch": 0.07, "learning_rate": "5.0000e-06", "loss": 1.9339, "slid_loss": 2.4414, "step": 9, "time": 83.78 }, { "epoch": 0.08, "learning_rate": "5.0000e-06", "loss": 1.8134, "slid_loss": 2.3786, "step": 10, "time": 79.06 }, { "epoch": 0.09, "learning_rate": "5.0000e-06", "loss": 1.8346, "slid_loss": 2.3292, "step": 11, "time": 80.19 }, { "epoch": 0.09, "learning_rate": "5.0000e-06", "loss": 1.801, "slid_loss": 2.2851, "step": 12, "time": 87.7 }, { "epoch": 0.1, "learning_rate": "5.0000e-06", "loss": 1.8712, "slid_loss": 2.2533, "step": 13, "time": 81.42 }, { "epoch": 0.11, "learning_rate": "5.0000e-06", "loss": 1.8017, "slid_loss": 2.221, "step": 14, "time": 81.39 }, { "epoch": 0.12, "learning_rate": "5.0000e-06", "loss": 1.7378, "slid_loss": 2.1888, "step": 15, "time": 80.29 }, { "epoch": 0.13, "learning_rate": "5.0000e-06", "loss": 1.9118, "slid_loss": 2.1715, "step": 16, "time": 83.04 }, { "epoch": 0.13, "learning_rate": "5.0000e-06", "loss": 1.7942, "slid_loss": 2.1493, "step": 17, "time": 75.98 }, { "epoch": 0.14, "learning_rate": "5.0000e-06", "loss": 1.7516, "slid_loss": 2.1272, "step": 18, "time": 88.91 }, { "epoch": 0.15, "learning_rate": "5.0000e-06", "loss": 1.8129, "slid_loss": 2.1107, "step": 19, "time": 80.76 }, { "epoch": 0.16, "learning_rate": "5.0000e-06", "loss": 1.7673, "slid_loss": 2.0935, "step": 20, "time": 81.28 }, { "epoch": 0.16, "learning_rate": "5.0000e-06", "loss": 1.7984, "slid_loss": 2.0795, "step": 21, "time": 85.37 }, { "epoch": 0.17, "learning_rate": "5.0000e-06", "loss": 1.7733, "slid_loss": 2.0655, "step": 22, "time": 80.72 }, { "epoch": 0.18, "learning_rate": "5.0000e-06", "loss": 1.7691, "slid_loss": 2.0527, "step": 23, "time": 80.11 }, { "epoch": 0.19, "learning_rate": "5.0000e-06", "loss": 1.8212, "slid_loss": 2.043, "step": 24, "time": 81.83 }, { "epoch": 0.2, "learning_rate": "5.0000e-06", "loss": 1.9154, "slid_loss": 2.0379, "step": 25, "time": 78.48 }, { "epoch": 0.2, "learning_rate": "5.0000e-06", "loss": 1.8267, "slid_loss": 2.0298, "step": 26, "time": 80.38 }, { "epoch": 0.21, "learning_rate": "5.0000e-06", "loss": 1.9029, "slid_loss": 2.0251, "step": 27, "time": 84.88 }, { "epoch": 0.22, "learning_rate": "5.0000e-06", "loss": 1.8892, "slid_loss": 2.0202, "step": 28, "time": 81.13 }, { "epoch": 0.23, "learning_rate": "5.0000e-06", "loss": 1.7599, "slid_loss": 2.0112, "step": 29, "time": 83.73 }, { "epoch": 0.24, "learning_rate": "5.0000e-06", "loss": 1.7919, "slid_loss": 2.0039, "step": 30, "time": 82.44 }, { "epoch": 0.24, "learning_rate": "5.0000e-06", "loss": 1.8086, "slid_loss": 1.9976, "step": 31, "time": 83.46 }, { "epoch": 0.25, "learning_rate": "5.0000e-06", "loss": 1.8349, "slid_loss": 1.9925, "step": 32, "time": 78.89 }, { "epoch": 0.26, "learning_rate": "5.0000e-06", "loss": 1.8263, "slid_loss": 1.9875, "step": 33, "time": 79.08 }, { "epoch": 0.27, "learning_rate": "5.0000e-06", "loss": 1.9287, "slid_loss": 1.9858, "step": 34, "time": 81.76 }, { "epoch": 0.27, "learning_rate": "5.0000e-06", "loss": 1.785, "slid_loss": 1.98, "step": 35, "time": 78.4 }, { "epoch": 0.28, "learning_rate": "5.0000e-06", "loss": 1.8191, "slid_loss": 1.9756, "step": 36, "time": 77.74 }, { "epoch": 0.29, "learning_rate": "5.0000e-06", "loss": 1.8219, "slid_loss": 1.9714, "step": 37, "time": 86.24 }, { "epoch": 0.3, "learning_rate": "5.0000e-06", "loss": 1.8075, "slid_loss": 1.9671, "step": 38, "time": 76.73 }, { "epoch": 0.31, "learning_rate": "5.0000e-06", "loss": 1.7785, "slid_loss": 1.9623, "step": 39, "time": 80.96 }, { "epoch": 0.31, "learning_rate": "5.0000e-06", "loss": 1.8296, "slid_loss": 1.959, "step": 40, "time": 83.93 }, { "epoch": 0.32, "learning_rate": "5.0000e-06", "loss": 1.7834, "slid_loss": 1.9547, "step": 41, "time": 77.98 }, { "epoch": 0.33, "learning_rate": "5.0000e-06", "loss": 1.7894, "slid_loss": 1.9507, "step": 42, "time": 82.3 }, { "epoch": 0.34, "learning_rate": "5.0000e-06", "loss": 1.8013, "slid_loss": 1.9473, "step": 43, "time": 84.45 }, { "epoch": 0.35, "learning_rate": "5.0000e-06", "loss": 1.7882, "slid_loss": 1.9436, "step": 44, "time": 78.67 }, { "epoch": 0.35, "learning_rate": "5.0000e-06", "loss": 1.7633, "slid_loss": 1.9396, "step": 45, "time": 79.58 }, { "epoch": 0.36, "learning_rate": "5.0000e-06", "loss": 1.8443, "slid_loss": 1.9376, "step": 46, "time": 79.13 }, { "epoch": 0.37, "learning_rate": "5.0000e-06", "loss": 1.7184, "slid_loss": 1.9329, "step": 47, "time": 78.73 }, { "epoch": 0.38, "learning_rate": "5.0000e-06", "loss": 1.7869, "slid_loss": 1.9299, "step": 48, "time": 77.96 }, { "epoch": 0.38, "learning_rate": "5.0000e-06", "loss": 1.8581, "slid_loss": 1.9284, "step": 49, "time": 84.5 }, { "epoch": 0.39, "learning_rate": "5.0000e-06", "loss": 1.7501, "slid_loss": 1.9248, "step": 50, "time": 79.17 }, { "epoch": 0.4, "learning_rate": "5.0000e-06", "loss": 1.7825, "slid_loss": 1.922, "step": 51, "time": 199.7 }, { "epoch": 0.41, "learning_rate": "5.0000e-06", "loss": 1.863, "slid_loss": 1.9209, "step": 52, "time": 81.1 }, { "epoch": 0.42, "learning_rate": "5.0000e-06", "loss": 1.8452, "slid_loss": 1.9195, "step": 53, "time": 77.93 }, { "epoch": 0.42, "learning_rate": "5.0000e-06", "loss": 1.8031, "slid_loss": 1.9173, "step": 54, "time": 78.92 }, { "epoch": 0.43, "learning_rate": "5.0000e-06", "loss": 1.7708, "slid_loss": 1.9147, "step": 55, "time": 77.85 }, { "epoch": 0.44, "learning_rate": "5.0000e-06", "loss": 1.7764, "slid_loss": 1.9122, "step": 56, "time": 76.81 }, { "epoch": 0.45, "learning_rate": "5.0000e-06", "loss": 1.7854, "slid_loss": 1.91, "step": 57, "time": 80.48 }, { "epoch": 0.46, "learning_rate": "5.0000e-06", "loss": 1.7826, "slid_loss": 1.9078, "step": 58, "time": 77.28 }, { "epoch": 0.46, "learning_rate": "5.0000e-06", "loss": 1.8057, "slid_loss": 1.906, "step": 59, "time": 77.73 }, { "epoch": 0.47, "learning_rate": "5.0000e-06", "loss": 1.7842, "slid_loss": 1.904, "step": 60, "time": 80.25 }, { "epoch": 0.48, "learning_rate": "5.0000e-06", "loss": 1.8633, "slid_loss": 1.9033, "step": 61, "time": 83.66 }, { "epoch": 0.49, "learning_rate": "5.0000e-06", "loss": 1.7775, "slid_loss": 1.9013, "step": 62, "time": 83.44 }, { "epoch": 0.49, "learning_rate": "5.0000e-06", "loss": 1.7193, "slid_loss": 1.8984, "step": 63, "time": 87.14 }, { "epoch": 0.5, "learning_rate": "5.0000e-06", "loss": 1.7909, "slid_loss": 1.8967, "step": 64, "time": 83.19 }, { "epoch": 0.51, "learning_rate": "5.0000e-06", "loss": 1.7992, "slid_loss": 1.8952, "step": 65, "time": 80.02 }, { "epoch": 0.52, "learning_rate": "5.0000e-06", "loss": 1.8568, "slid_loss": 1.8947, "step": 66, "time": 82.08 }, { "epoch": 0.53, "learning_rate": "5.0000e-06", "loss": 1.8153, "slid_loss": 1.8935, "step": 67, "time": 81.18 }, { "epoch": 0.53, "learning_rate": "5.0000e-06", "loss": 1.8354, "slid_loss": 1.8926, "step": 68, "time": 80.32 }, { "epoch": 0.54, "learning_rate": "5.0000e-06", "loss": 1.8226, "slid_loss": 1.8916, "step": 69, "time": 80.34 }, { "epoch": 0.55, "learning_rate": "5.0000e-06", "loss": 1.7428, "slid_loss": 1.8895, "step": 70, "time": 81.95 }, { "epoch": 0.56, "learning_rate": "5.0000e-06", "loss": 1.7535, "slid_loss": 1.8876, "step": 71, "time": 79.71 }, { "epoch": 0.57, "learning_rate": "5.0000e-06", "loss": 1.7228, "slid_loss": 1.8853, "step": 72, "time": 81.39 }, { "epoch": 0.57, "learning_rate": "5.0000e-06", "loss": 1.768, "slid_loss": 1.8837, "step": 73, "time": 83.67 }, { "epoch": 0.58, "learning_rate": "5.0000e-06", "loss": 1.8065, "slid_loss": 1.8826, "step": 74, "time": 84.41 }, { "epoch": 0.59, "learning_rate": "5.0000e-06", "loss": 1.8008, "slid_loss": 1.8815, "step": 75, "time": 80.91 }, { "epoch": 0.6, "learning_rate": "5.0000e-06", "loss": 1.8525, "slid_loss": 1.8812, "step": 76, "time": 84.8 }, { "epoch": 0.6, "learning_rate": "5.0000e-06", "loss": 1.7307, "slid_loss": 1.8792, "step": 77, "time": 78.98 }, { "epoch": 0.61, "learning_rate": "5.0000e-06", "loss": 1.7338, "slid_loss": 1.8773, "step": 78, "time": 84.67 }, { "epoch": 0.62, "learning_rate": "5.0000e-06", "loss": 1.7968, "slid_loss": 1.8763, "step": 79, "time": 78.49 }, { "epoch": 0.63, "learning_rate": "5.0000e-06", "loss": 1.7787, "slid_loss": 1.8751, "step": 80, "time": 82.89 }, { "epoch": 0.64, "learning_rate": "5.0000e-06", "loss": 1.779, "slid_loss": 1.8739, "step": 81, "time": 78.23 }, { "epoch": 0.64, "learning_rate": "5.0000e-06", "loss": 1.7907, "slid_loss": 1.8729, "step": 82, "time": 81.05 }, { "epoch": 0.65, "learning_rate": "5.0000e-06", "loss": 1.7231, "slid_loss": 1.8711, "step": 83, "time": 79.99 }, { "epoch": 0.66, "learning_rate": "5.0000e-06", "loss": 1.7397, "slid_loss": 1.8695, "step": 84, "time": 79.98 }, { "epoch": 0.67, "learning_rate": "5.0000e-06", "loss": 1.7482, "slid_loss": 1.8681, "step": 85, "time": 79.09 }, { "epoch": 0.68, "learning_rate": "5.0000e-06", "loss": 1.7731, "slid_loss": 1.867, "step": 86, "time": 76.58 }, { "epoch": 0.68, "learning_rate": "5.0000e-06", "loss": 1.8358, "slid_loss": 1.8666, "step": 87, "time": 81.05 }, { "epoch": 0.69, "learning_rate": "5.0000e-06", "loss": 1.7569, "slid_loss": 1.8654, "step": 88, "time": 78.52 }, { "epoch": 0.7, "learning_rate": "5.0000e-06", "loss": 1.7772, "slid_loss": 1.8644, "step": 89, "time": 80.4 }, { "epoch": 0.71, "learning_rate": "5.0000e-06", "loss": 1.784, "slid_loss": 1.8635, "step": 90, "time": 86.12 }, { "epoch": 0.71, "learning_rate": "5.0000e-06", "loss": 1.7401, "slid_loss": 1.8621, "step": 91, "time": 80.56 }, { "epoch": 0.72, "learning_rate": "5.0000e-06", "loss": 1.7645, "slid_loss": 1.8611, "step": 92, "time": 76.77 }, { "epoch": 0.73, "learning_rate": "5.0000e-06", "loss": 1.7251, "slid_loss": 1.8596, "step": 93, "time": 80.72 }, { "epoch": 0.74, "learning_rate": "5.0000e-06", "loss": 1.792, "slid_loss": 1.8589, "step": 94, "time": 80.45 }, { "epoch": 0.75, "learning_rate": "5.0000e-06", "loss": 1.7834, "slid_loss": 1.8581, "step": 95, "time": 80.28 }, { "epoch": 0.75, "learning_rate": "5.0000e-06", "loss": 1.7851, "slid_loss": 1.8574, "step": 96, "time": 81.67 }, { "epoch": 0.76, "learning_rate": "5.0000e-06", "loss": 1.8255, "slid_loss": 1.857, "step": 97, "time": 77.73 }, { "epoch": 0.77, "learning_rate": "5.0000e-06", "loss": 1.8322, "slid_loss": 1.8568, "step": 98, "time": 79.82 }, { "epoch": 0.78, "learning_rate": "5.0000e-06", "loss": 1.7494, "slid_loss": 1.8557, "step": 99, "time": 83.73 }, { "epoch": 0.79, "learning_rate": "5.0000e-06", "loss": 1.8002, "slid_loss": 1.8551, "step": 100, "time": 80.04 }, { "epoch": 0.79, "learning_rate": "5.0000e-06", "loss": 1.6975, "slid_loss": 1.841, "step": 101, "time": 205.33 }, { "epoch": 0.8, "learning_rate": "5.0000e-06", "loss": 1.792, "slid_loss": 1.8306, "step": 102, "time": 86.16 }, { "epoch": 0.81, "learning_rate": "5.0000e-06", "loss": 1.8129, "slid_loss": 1.8201, "step": 103, "time": 80.84 }, { "epoch": 0.82, "learning_rate": "5.0000e-06", "loss": 1.7504, "slid_loss": 1.8121, "step": 104, "time": 79.26 }, { "epoch": 0.82, "learning_rate": "5.0000e-06", "loss": 1.688, "slid_loss": 1.8031, "step": 105, "time": 99.38 }, { "epoch": 0.83, "learning_rate": "5.0000e-06", "loss": 1.8118, "slid_loss": 1.8004, "step": 106, "time": 85.96 }, { "epoch": 0.84, "learning_rate": "5.0000e-06", "loss": 1.7048, "slid_loss": 1.7965, "step": 107, "time": 84.68 }, { "epoch": 0.85, "learning_rate": "5.0000e-06", "loss": 1.762, "slid_loss": 1.7949, "step": 108, "time": 79.12 }, { "epoch": 0.86, "learning_rate": "5.0000e-06", "loss": 1.799, "slid_loss": 1.7936, "step": 109, "time": 85.23 }, { "epoch": 0.86, "learning_rate": "5.0000e-06", "loss": 1.7582, "slid_loss": 1.793, "step": 110, "time": 85.47 }, { "epoch": 0.87, "learning_rate": "5.0000e-06", "loss": 1.7241, "slid_loss": 1.7919, "step": 111, "time": 81.73 }, { "epoch": 0.88, "learning_rate": "5.0000e-06", "loss": 1.7806, "slid_loss": 1.7917, "step": 112, "time": 85.27 }, { "epoch": 0.89, "learning_rate": "5.0000e-06", "loss": 1.7839, "slid_loss": 1.7909, "step": 113, "time": 78.94 }, { "epoch": 0.9, "learning_rate": "5.0000e-06", "loss": 1.789, "slid_loss": 1.7907, "step": 114, "time": 80.05 }, { "epoch": 0.9, "learning_rate": "5.0000e-06", "loss": 1.7831, "slid_loss": 1.7912, "step": 115, "time": 82.11 }, { "epoch": 0.91, "learning_rate": "5.0000e-06", "loss": 1.7619, "slid_loss": 1.7897, "step": 116, "time": 78.06 }, { "epoch": 0.92, "learning_rate": "5.0000e-06", "loss": 1.8384, "slid_loss": 1.7901, "step": 117, "time": 77.8 }, { "epoch": 0.93, "learning_rate": "5.0000e-06", "loss": 1.78, "slid_loss": 1.7904, "step": 118, "time": 81.19 }, { "epoch": 0.93, "learning_rate": "5.0000e-06", "loss": 1.7805, "slid_loss": 1.7901, "step": 119, "time": 83.57 }, { "epoch": 0.94, "learning_rate": "5.0000e-06", "loss": 1.7509, "slid_loss": 1.7899, "step": 120, "time": 78.54 }, { "epoch": 0.95, "learning_rate": "5.0000e-06", "loss": 1.7806, "slid_loss": 1.7897, "step": 121, "time": 83.97 }, { "epoch": 0.96, "learning_rate": "5.0000e-06", "loss": 1.7887, "slid_loss": 1.7899, "step": 122, "time": 83.74 }, { "epoch": 0.97, "learning_rate": "5.0000e-06", "loss": 1.7084, "slid_loss": 1.7893, "step": 123, "time": 78.85 }, { "epoch": 0.97, "learning_rate": "5.0000e-06", "loss": 1.7843, "slid_loss": 1.7889, "step": 124, "time": 79.91 }, { "epoch": 0.98, "learning_rate": "5.0000e-06", "loss": 1.8215, "slid_loss": 1.788, "step": 125, "time": 77.9 }, { "epoch": 0.99, "learning_rate": "5.0000e-06", "loss": 1.8352, "slid_loss": 1.7881, "step": 126, "time": 82.06 }, { "epoch": 1.0, "learning_rate": "5.0000e-06", "loss": 1.7802, "slid_loss": 1.7868, "step": 127, "time": 80.15 }, { "epoch": 1.0, "step": 127, "time": 0.01, "total_flos": 0.0, "train_loss": 1.837487073395196, "train_runtime": 10569.7638, "train_samples_per_second": 12.332, "train_steps_per_second": 0.012 } ], "logging_steps": 1.0, "max_steps": 127, "num_train_epochs": 1, "save_steps": 50, "total_flos": 0.0, "trial_name": null, "trial_params": null }