{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.974948758824869, "global_step": 10950, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.23, "learning_rate": 4.553734061930783e-05, "loss": 1.3816, "step": 250 }, { "epoch": 0.23, "eval_loss": 0.1994573473930359, "eval_runtime": 49.3939, "eval_samples_per_second": 50.553, "eval_steps_per_second": 6.337, "step": 250 }, { "epoch": 0.46, "learning_rate": 9.107468123861566e-05, "loss": 0.1818, "step": 500 }, { "epoch": 0.46, "eval_loss": 0.18140748143196106, "eval_runtime": 49.3506, "eval_samples_per_second": 50.597, "eval_steps_per_second": 6.342, "step": 500 }, { "epoch": 0.68, "learning_rate": 9.807120237981e-05, "loss": 0.1716, "step": 750 }, { "epoch": 0.68, "eval_loss": 0.1712454855442047, "eval_runtime": 49.3274, "eval_samples_per_second": 50.621, "eval_steps_per_second": 6.345, "step": 750 }, { "epoch": 0.91, "learning_rate": 9.567220036464831e-05, "loss": 0.1501, "step": 1000 }, { "epoch": 0.91, "eval_loss": 0.1643093228340149, "eval_runtime": 49.3431, "eval_samples_per_second": 50.605, "eval_steps_per_second": 6.343, "step": 1000 }, { "epoch": 1.14, "learning_rate": 9.327319834948663e-05, "loss": 0.122, "step": 1250 }, { "epoch": 1.14, "eval_loss": 0.16473376750946045, "eval_runtime": 49.4257, "eval_samples_per_second": 50.52, "eval_steps_per_second": 6.333, "step": 1250 }, { "epoch": 1.37, "learning_rate": 9.087419633432492e-05, "loss": 0.1143, "step": 1500 }, { "epoch": 1.37, "eval_loss": 0.15297986567020416, "eval_runtime": 49.4063, "eval_samples_per_second": 50.54, "eval_steps_per_second": 6.335, "step": 1500 }, { "epoch": 1.59, "learning_rate": 8.847519431916324e-05, "loss": 0.1121, "step": 1750 }, { "epoch": 1.59, "eval_loss": 0.16007670760154724, "eval_runtime": 49.3369, "eval_samples_per_second": 50.611, "eval_steps_per_second": 6.344, "step": 1750 }, { "epoch": 1.82, "learning_rate": 8.607619230400153e-05, "loss": 0.1078, "step": 2000 }, { "epoch": 1.82, "eval_loss": 0.14469072222709656, "eval_runtime": 49.3439, "eval_samples_per_second": 50.604, "eval_steps_per_second": 6.343, "step": 2000 }, { "epoch": 2.05, "learning_rate": 8.367719028883985e-05, "loss": 0.0954, "step": 2250 }, { "epoch": 2.05, "eval_loss": 0.14710724353790283, "eval_runtime": 49.3536, "eval_samples_per_second": 50.594, "eval_steps_per_second": 6.342, "step": 2250 }, { "epoch": 2.28, "learning_rate": 8.127818827367816e-05, "loss": 0.0804, "step": 2500 }, { "epoch": 2.28, "eval_loss": 0.13685058057308197, "eval_runtime": 49.355, "eval_samples_per_second": 50.593, "eval_steps_per_second": 6.342, "step": 2500 }, { "epoch": 2.51, "learning_rate": 7.887918625851645e-05, "loss": 0.08, "step": 2750 }, { "epoch": 2.51, "eval_loss": 0.13740722835063934, "eval_runtime": 49.3607, "eval_samples_per_second": 50.587, "eval_steps_per_second": 6.341, "step": 2750 }, { "epoch": 2.73, "learning_rate": 7.648018424335477e-05, "loss": 0.0769, "step": 3000 }, { "epoch": 2.73, "eval_loss": 0.1370161473751068, "eval_runtime": 49.3701, "eval_samples_per_second": 50.577, "eval_steps_per_second": 6.34, "step": 3000 }, { "epoch": 2.96, "learning_rate": 7.408118222819308e-05, "loss": 0.0782, "step": 3250 }, { "epoch": 2.96, "eval_loss": 0.13374929130077362, "eval_runtime": 49.4074, "eval_samples_per_second": 50.539, "eval_steps_per_second": 6.335, "step": 3250 }, { "epoch": 3.19, "learning_rate": 7.168218021303138e-05, "loss": 0.0591, "step": 3500 }, { "epoch": 3.19, "eval_loss": 0.13687731325626373, "eval_runtime": 49.344, "eval_samples_per_second": 50.604, "eval_steps_per_second": 6.343, "step": 3500 }, { "epoch": 3.42, "learning_rate": 6.928317819786969e-05, "loss": 0.0575, "step": 3750 }, { "epoch": 3.42, "eval_loss": 0.13442662358283997, "eval_runtime": 49.347, "eval_samples_per_second": 50.601, "eval_steps_per_second": 6.343, "step": 3750 }, { "epoch": 3.64, "learning_rate": 6.6884176182708e-05, "loss": 0.0579, "step": 4000 }, { "epoch": 3.64, "eval_loss": 0.13463211059570312, "eval_runtime": 49.3532, "eval_samples_per_second": 50.595, "eval_steps_per_second": 6.342, "step": 4000 }, { "epoch": 3.87, "learning_rate": 6.44851741675463e-05, "loss": 0.0541, "step": 4250 }, { "epoch": 3.87, "eval_loss": 0.12762367725372314, "eval_runtime": 49.3131, "eval_samples_per_second": 50.636, "eval_steps_per_second": 6.347, "step": 4250 }, { "epoch": 4.1, "learning_rate": 6.208617215238462e-05, "loss": 0.0469, "step": 4500 }, { "epoch": 4.1, "eval_loss": 0.1321054846048355, "eval_runtime": 49.3382, "eval_samples_per_second": 50.61, "eval_steps_per_second": 6.344, "step": 4500 }, { "epoch": 4.33, "learning_rate": 5.968717013722291e-05, "loss": 0.0367, "step": 4750 }, { "epoch": 4.33, "eval_loss": 0.13329076766967773, "eval_runtime": 49.3411, "eval_samples_per_second": 50.607, "eval_steps_per_second": 6.344, "step": 4750 }, { "epoch": 4.55, "learning_rate": 5.7288168122061226e-05, "loss": 0.0409, "step": 5000 }, { "epoch": 4.55, "eval_loss": 0.13458645343780518, "eval_runtime": 49.3393, "eval_samples_per_second": 50.609, "eval_steps_per_second": 6.344, "step": 5000 }, { "epoch": 4.78, "learning_rate": 5.488916610689954e-05, "loss": 0.0402, "step": 5250 }, { "epoch": 4.78, "eval_loss": 0.12923233211040497, "eval_runtime": 49.4868, "eval_samples_per_second": 50.458, "eval_steps_per_second": 6.325, "step": 5250 }, { "epoch": 5.01, "learning_rate": 5.249016409173784e-05, "loss": 0.0378, "step": 5500 }, { "epoch": 5.01, "eval_loss": 0.12460647523403168, "eval_runtime": 49.3719, "eval_samples_per_second": 50.575, "eval_steps_per_second": 6.34, "step": 5500 }, { "epoch": 5.24, "learning_rate": 5.009116207657615e-05, "loss": 0.0258, "step": 5750 }, { "epoch": 5.24, "eval_loss": 0.1305789053440094, "eval_runtime": 49.3929, "eval_samples_per_second": 50.554, "eval_steps_per_second": 6.337, "step": 5750 }, { "epoch": 5.47, "learning_rate": 4.769216006141446e-05, "loss": 0.0252, "step": 6000 }, { "epoch": 5.47, "eval_loss": 0.13075487315654755, "eval_runtime": 49.3793, "eval_samples_per_second": 50.568, "eval_steps_per_second": 6.339, "step": 6000 }, { "epoch": 5.69, "learning_rate": 4.5293158046252756e-05, "loss": 0.0266, "step": 6250 }, { "epoch": 5.69, "eval_loss": 0.13152019679546356, "eval_runtime": 49.3918, "eval_samples_per_second": 50.555, "eval_steps_per_second": 6.337, "step": 6250 }, { "epoch": 5.92, "learning_rate": 4.289415603109107e-05, "loss": 0.0264, "step": 6500 }, { "epoch": 5.92, "eval_loss": 0.12978705763816833, "eval_runtime": 49.4158, "eval_samples_per_second": 50.53, "eval_steps_per_second": 6.334, "step": 6500 }, { "epoch": 6.15, "learning_rate": 4.0495154015929375e-05, "loss": 0.0204, "step": 6750 }, { "epoch": 6.15, "eval_loss": 0.1330789029598236, "eval_runtime": 49.4096, "eval_samples_per_second": 50.537, "eval_steps_per_second": 6.335, "step": 6750 }, { "epoch": 6.38, "learning_rate": 3.809615200076768e-05, "loss": 0.0176, "step": 7000 }, { "epoch": 6.38, "eval_loss": 0.13327623903751373, "eval_runtime": 49.4281, "eval_samples_per_second": 50.518, "eval_steps_per_second": 6.332, "step": 7000 }, { "epoch": 6.6, "learning_rate": 3.569714998560599e-05, "loss": 0.0177, "step": 7250 }, { "epoch": 6.6, "eval_loss": 0.13123974204063416, "eval_runtime": 49.4644, "eval_samples_per_second": 50.481, "eval_steps_per_second": 6.328, "step": 7250 }, { "epoch": 6.83, "learning_rate": 3.32981479704443e-05, "loss": 0.0161, "step": 7500 }, { "epoch": 6.83, "eval_loss": 0.1328614354133606, "eval_runtime": 49.4499, "eval_samples_per_second": 50.496, "eval_steps_per_second": 6.33, "step": 7500 }, { "epoch": 7.06, "learning_rate": 3.0899145955282606e-05, "loss": 0.016, "step": 7750 }, { "epoch": 7.06, "eval_loss": 0.13026753067970276, "eval_runtime": 49.4661, "eval_samples_per_second": 50.479, "eval_steps_per_second": 6.328, "step": 7750 }, { "epoch": 7.29, "learning_rate": 2.850014394012091e-05, "loss": 0.0104, "step": 8000 }, { "epoch": 7.29, "eval_loss": 0.13250969350337982, "eval_runtime": 49.5109, "eval_samples_per_second": 50.433, "eval_steps_per_second": 6.322, "step": 8000 }, { "epoch": 7.52, "learning_rate": 2.6101141924959215e-05, "loss": 0.0104, "step": 8250 }, { "epoch": 7.52, "eval_loss": 0.1344473958015442, "eval_runtime": 49.4545, "eval_samples_per_second": 50.491, "eval_steps_per_second": 6.329, "step": 8250 }, { "epoch": 7.74, "learning_rate": 2.3702139909797524e-05, "loss": 0.0107, "step": 8500 }, { "epoch": 7.74, "eval_loss": 0.13361412286758423, "eval_runtime": 49.4611, "eval_samples_per_second": 50.484, "eval_steps_per_second": 6.328, "step": 8500 }, { "epoch": 7.97, "learning_rate": 2.1303137894635834e-05, "loss": 0.0105, "step": 8750 }, { "epoch": 7.97, "eval_loss": 0.1311049610376358, "eval_runtime": 49.3899, "eval_samples_per_second": 50.557, "eval_steps_per_second": 6.337, "step": 8750 }, { "epoch": 8.2, "learning_rate": 1.890413587947414e-05, "loss": 0.0072, "step": 9000 }, { "epoch": 8.2, "eval_loss": 0.1345677375793457, "eval_runtime": 49.4945, "eval_samples_per_second": 50.45, "eval_steps_per_second": 6.324, "step": 9000 }, { "epoch": 8.43, "learning_rate": 1.6505133864312446e-05, "loss": 0.0065, "step": 9250 }, { "epoch": 8.43, "eval_loss": 0.13423801958560944, "eval_runtime": 49.4363, "eval_samples_per_second": 50.509, "eval_steps_per_second": 6.331, "step": 9250 }, { "epoch": 8.65, "learning_rate": 1.4106131849150753e-05, "loss": 0.0062, "step": 9500 }, { "epoch": 8.65, "eval_loss": 0.13279776275157928, "eval_runtime": 49.5198, "eval_samples_per_second": 50.424, "eval_steps_per_second": 6.321, "step": 9500 }, { "epoch": 8.88, "learning_rate": 1.1707129833989061e-05, "loss": 0.006, "step": 9750 }, { "epoch": 8.88, "eval_loss": 0.13258913159370422, "eval_runtime": 49.511, "eval_samples_per_second": 50.433, "eval_steps_per_second": 6.322, "step": 9750 }, { "epoch": 9.11, "learning_rate": 9.308127818827369e-06, "loss": 0.0052, "step": 10000 }, { "epoch": 9.11, "eval_loss": 0.13228829205036163, "eval_runtime": 49.4456, "eval_samples_per_second": 50.5, "eval_steps_per_second": 6.33, "step": 10000 }, { "epoch": 9.34, "learning_rate": 6.909125803665675e-06, "loss": 0.0039, "step": 10250 }, { "epoch": 9.34, "eval_loss": 0.13294672966003418, "eval_runtime": 49.4619, "eval_samples_per_second": 50.483, "eval_steps_per_second": 6.328, "step": 10250 }, { "epoch": 9.57, "learning_rate": 4.510123788503983e-06, "loss": 0.0039, "step": 10500 }, { "epoch": 9.57, "eval_loss": 0.13272705674171448, "eval_runtime": 49.4568, "eval_samples_per_second": 50.489, "eval_steps_per_second": 6.329, "step": 10500 }, { "epoch": 9.79, "learning_rate": 2.11112177334229e-06, "loss": 0.004, "step": 10750 }, { "epoch": 9.79, "eval_loss": 0.13220719993114471, "eval_runtime": 49.465, "eval_samples_per_second": 50.48, "eval_steps_per_second": 6.328, "step": 10750 } ], "max_steps": 10970, "num_train_epochs": 10, "total_flos": 8.710540889772442e+16, "trial_name": null, "trial_params": null }