{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.882352941176471, "eval_steps": 10, "global_step": 550, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.11, "learning_rate": 9.820788530465951e-05, "loss": 2.039, "step": 10 }, { "epoch": 0.11, "eval_accuracy": 0.15425531566143036, "eval_loss": 1.9874085187911987, "eval_runtime": 18.0154, "eval_samples_per_second": 10.436, "eval_steps_per_second": 2.609, "step": 10 }, { "epoch": 0.21, "learning_rate": 9.6415770609319e-05, "loss": 1.8674, "step": 20 }, { "epoch": 0.21, "eval_accuracy": 0.18617020547389984, "eval_loss": 1.915345549583435, "eval_runtime": 19.3568, "eval_samples_per_second": 9.712, "eval_steps_per_second": 2.428, "step": 20 }, { "epoch": 0.32, "learning_rate": 9.46236559139785e-05, "loss": 1.9047, "step": 30 }, { "epoch": 0.32, "eval_accuracy": 0.21808511018753052, "eval_loss": 1.8614839315414429, "eval_runtime": 16.987, "eval_samples_per_second": 11.067, "eval_steps_per_second": 2.767, "step": 30 }, { "epoch": 0.43, "learning_rate": 9.2831541218638e-05, "loss": 1.8554, "step": 40 }, { "epoch": 0.43, "eval_accuracy": 0.3297872245311737, "eval_loss": 1.7530335187911987, "eval_runtime": 17.9287, "eval_samples_per_second": 10.486, "eval_steps_per_second": 2.621, "step": 40 }, { "epoch": 0.53, "learning_rate": 9.10394265232975e-05, "loss": 1.7546, "step": 50 }, { "epoch": 0.53, "eval_accuracy": 0.34574466943740845, "eval_loss": 1.693456768989563, "eval_runtime": 17.7507, "eval_samples_per_second": 10.591, "eval_steps_per_second": 2.648, "step": 50 }, { "epoch": 0.64, "learning_rate": 8.924731182795699e-05, "loss": 1.7583, "step": 60 }, { "epoch": 0.64, "eval_accuracy": 0.31382977962493896, "eval_loss": 1.7744777202606201, "eval_runtime": 17.5758, "eval_samples_per_second": 10.697, "eval_steps_per_second": 2.674, "step": 60 }, { "epoch": 0.75, "learning_rate": 8.74551971326165e-05, "loss": 1.6846, "step": 70 }, { "epoch": 0.75, "eval_accuracy": 0.47340425848960876, "eval_loss": 1.383662462234497, "eval_runtime": 17.9239, "eval_samples_per_second": 10.489, "eval_steps_per_second": 2.622, "step": 70 }, { "epoch": 0.86, "learning_rate": 8.566308243727598e-05, "loss": 1.335, "step": 80 }, { "epoch": 0.86, "eval_accuracy": 0.4095744788646698, "eval_loss": 1.3189808130264282, "eval_runtime": 17.9015, "eval_samples_per_second": 10.502, "eval_steps_per_second": 2.625, "step": 80 }, { "epoch": 0.96, "learning_rate": 8.387096774193549e-05, "loss": 1.3937, "step": 90 }, { "epoch": 0.96, "eval_accuracy": 0.5, "eval_loss": 1.2728934288024902, "eval_runtime": 16.8503, "eval_samples_per_second": 11.157, "eval_steps_per_second": 2.789, "step": 90 }, { "epoch": 1.07, "learning_rate": 8.207885304659499e-05, "loss": 1.3213, "step": 100 }, { "epoch": 1.07, "eval_accuracy": 0.5319148898124695, "eval_loss": 1.348612666130066, "eval_runtime": 17.9939, "eval_samples_per_second": 10.448, "eval_steps_per_second": 2.612, "step": 100 }, { "epoch": 1.18, "learning_rate": 8.028673835125448e-05, "loss": 1.2577, "step": 110 }, { "epoch": 1.18, "eval_accuracy": 0.43617022037506104, "eval_loss": 1.4858437776565552, "eval_runtime": 16.7284, "eval_samples_per_second": 11.238, "eval_steps_per_second": 2.81, "step": 110 }, { "epoch": 1.28, "learning_rate": 7.849462365591398e-05, "loss": 1.3039, "step": 120 }, { "epoch": 1.28, "eval_accuracy": 0.5372340679168701, "eval_loss": 1.1272777318954468, "eval_runtime": 18.0338, "eval_samples_per_second": 10.425, "eval_steps_per_second": 2.606, "step": 120 }, { "epoch": 1.39, "learning_rate": 7.670250896057349e-05, "loss": 1.2018, "step": 130 }, { "epoch": 1.39, "eval_accuracy": 0.6010638475418091, "eval_loss": 1.07073974609375, "eval_runtime": 18.3403, "eval_samples_per_second": 10.251, "eval_steps_per_second": 2.563, "step": 130 }, { "epoch": 1.5, "learning_rate": 7.491039426523297e-05, "loss": 1.0523, "step": 140 }, { "epoch": 1.5, "eval_accuracy": 0.6223404407501221, "eval_loss": 0.9273392558097839, "eval_runtime": 17.4553, "eval_samples_per_second": 10.77, "eval_steps_per_second": 2.693, "step": 140 }, { "epoch": 1.6, "learning_rate": 7.311827956989248e-05, "loss": 0.8089, "step": 150 }, { "epoch": 1.6, "eval_accuracy": 0.6436170339584351, "eval_loss": 0.958331823348999, "eval_runtime": 18.1909, "eval_samples_per_second": 10.335, "eval_steps_per_second": 2.584, "step": 150 }, { "epoch": 1.71, "learning_rate": 7.132616487455197e-05, "loss": 0.86, "step": 160 }, { "epoch": 1.71, "eval_accuracy": 0.728723406791687, "eval_loss": 0.7838733792304993, "eval_runtime": 17.986, "eval_samples_per_second": 10.453, "eval_steps_per_second": 2.613, "step": 160 }, { "epoch": 1.82, "learning_rate": 6.953405017921147e-05, "loss": 0.7991, "step": 170 }, { "epoch": 1.82, "eval_accuracy": 0.7393617033958435, "eval_loss": 0.7742288708686829, "eval_runtime": 17.5516, "eval_samples_per_second": 10.711, "eval_steps_per_second": 2.678, "step": 170 }, { "epoch": 1.93, "learning_rate": 6.774193548387096e-05, "loss": 0.6772, "step": 180 }, { "epoch": 1.93, "eval_accuracy": 0.728723406791687, "eval_loss": 0.7831820845603943, "eval_runtime": 17.1247, "eval_samples_per_second": 10.978, "eval_steps_per_second": 2.745, "step": 180 }, { "epoch": 2.03, "learning_rate": 6.594982078853047e-05, "loss": 0.5716, "step": 190 }, { "epoch": 2.03, "eval_accuracy": 0.6808510422706604, "eval_loss": 0.8607780933380127, "eval_runtime": 17.9663, "eval_samples_per_second": 10.464, "eval_steps_per_second": 2.616, "step": 190 }, { "epoch": 2.14, "learning_rate": 6.415770609318996e-05, "loss": 0.7228, "step": 200 }, { "epoch": 2.14, "eval_accuracy": 0.7127659320831299, "eval_loss": 0.9357430338859558, "eval_runtime": 16.801, "eval_samples_per_second": 11.19, "eval_steps_per_second": 2.797, "step": 200 }, { "epoch": 2.25, "learning_rate": 6.236559139784946e-05, "loss": 0.5451, "step": 210 }, { "epoch": 2.25, "eval_accuracy": 0.6489361524581909, "eval_loss": 1.018636703491211, "eval_runtime": 17.1948, "eval_samples_per_second": 10.934, "eval_steps_per_second": 2.733, "step": 210 }, { "epoch": 2.35, "learning_rate": 6.057347670250897e-05, "loss": 0.6429, "step": 220 }, { "epoch": 2.35, "eval_accuracy": 0.6968085169792175, "eval_loss": 0.917243242263794, "eval_runtime": 17.5045, "eval_samples_per_second": 10.74, "eval_steps_per_second": 2.685, "step": 220 }, { "epoch": 2.46, "learning_rate": 5.878136200716846e-05, "loss": 0.8224, "step": 230 }, { "epoch": 2.46, "eval_accuracy": 0.6329787373542786, "eval_loss": 1.0460094213485718, "eval_runtime": 16.9483, "eval_samples_per_second": 11.093, "eval_steps_per_second": 2.773, "step": 230 }, { "epoch": 2.57, "learning_rate": 5.6989247311827965e-05, "loss": 0.6061, "step": 240 }, { "epoch": 2.57, "eval_accuracy": 0.7446808218955994, "eval_loss": 0.8167480230331421, "eval_runtime": 16.9079, "eval_samples_per_second": 11.119, "eval_steps_per_second": 2.78, "step": 240 }, { "epoch": 2.67, "learning_rate": 5.519713261648746e-05, "loss": 0.3594, "step": 250 }, { "epoch": 2.67, "eval_accuracy": 0.7234042286872864, "eval_loss": 0.8162137866020203, "eval_runtime": 16.7777, "eval_samples_per_second": 11.205, "eval_steps_per_second": 2.801, "step": 250 }, { "epoch": 2.78, "learning_rate": 5.340501792114696e-05, "loss": 0.4122, "step": 260 }, { "epoch": 2.78, "eval_accuracy": 0.7819148898124695, "eval_loss": 0.7674133777618408, "eval_runtime": 16.8556, "eval_samples_per_second": 11.154, "eval_steps_per_second": 2.788, "step": 260 }, { "epoch": 2.89, "learning_rate": 5.161290322580645e-05, "loss": 0.2874, "step": 270 }, { "epoch": 2.89, "eval_accuracy": 0.7978723645210266, "eval_loss": 0.6874329447746277, "eval_runtime": 18.1768, "eval_samples_per_second": 10.343, "eval_steps_per_second": 2.586, "step": 270 }, { "epoch": 2.99, "learning_rate": 4.982078853046595e-05, "loss": 0.4881, "step": 280 }, { "epoch": 2.99, "eval_accuracy": 0.7553191781044006, "eval_loss": 0.7479570508003235, "eval_runtime": 17.7772, "eval_samples_per_second": 10.575, "eval_steps_per_second": 2.644, "step": 280 }, { "epoch": 3.1, "learning_rate": 4.802867383512545e-05, "loss": 0.5232, "step": 290 }, { "epoch": 3.1, "eval_accuracy": 0.7659574747085571, "eval_loss": 0.7912107706069946, "eval_runtime": 17.9423, "eval_samples_per_second": 10.478, "eval_steps_per_second": 2.62, "step": 290 }, { "epoch": 3.21, "learning_rate": 4.6236559139784944e-05, "loss": 0.1914, "step": 300 }, { "epoch": 3.21, "eval_accuracy": 0.7978723645210266, "eval_loss": 0.7023528218269348, "eval_runtime": 18.317, "eval_samples_per_second": 10.264, "eval_steps_per_second": 2.566, "step": 300 }, { "epoch": 3.32, "learning_rate": 4.4444444444444447e-05, "loss": 0.2573, "step": 310 }, { "epoch": 3.32, "eval_accuracy": 0.7659574747085571, "eval_loss": 0.7678730487823486, "eval_runtime": 16.9343, "eval_samples_per_second": 11.102, "eval_steps_per_second": 2.775, "step": 310 }, { "epoch": 3.42, "learning_rate": 4.265232974910394e-05, "loss": 0.1959, "step": 320 }, { "epoch": 3.42, "eval_accuracy": 0.7819148898124695, "eval_loss": 0.7615554332733154, "eval_runtime": 17.2171, "eval_samples_per_second": 10.919, "eval_steps_per_second": 2.73, "step": 320 }, { "epoch": 3.53, "learning_rate": 4.0860215053763444e-05, "loss": 0.2317, "step": 330 }, { "epoch": 3.53, "eval_accuracy": 0.8191489577293396, "eval_loss": 0.6963978409767151, "eval_runtime": 18.8086, "eval_samples_per_second": 9.995, "eval_steps_per_second": 2.499, "step": 330 }, { "epoch": 3.64, "learning_rate": 3.906810035842295e-05, "loss": 0.2674, "step": 340 }, { "epoch": 3.64, "eval_accuracy": 0.813829779624939, "eval_loss": 0.6807590126991272, "eval_runtime": 18.085, "eval_samples_per_second": 10.395, "eval_steps_per_second": 2.599, "step": 340 }, { "epoch": 3.74, "learning_rate": 3.727598566308244e-05, "loss": 0.1163, "step": 350 }, { "epoch": 3.74, "eval_accuracy": 0.8297872543334961, "eval_loss": 0.6285024881362915, "eval_runtime": 18.2735, "eval_samples_per_second": 10.288, "eval_steps_per_second": 2.572, "step": 350 }, { "epoch": 3.85, "learning_rate": 3.548387096774194e-05, "loss": 0.1601, "step": 360 }, { "epoch": 3.85, "eval_accuracy": 0.813829779624939, "eval_loss": 0.68441241979599, "eval_runtime": 18.2649, "eval_samples_per_second": 10.293, "eval_steps_per_second": 2.573, "step": 360 }, { "epoch": 3.96, "learning_rate": 3.369175627240144e-05, "loss": 0.2855, "step": 370 }, { "epoch": 3.96, "eval_accuracy": 0.792553186416626, "eval_loss": 0.7759273052215576, "eval_runtime": 16.9436, "eval_samples_per_second": 11.096, "eval_steps_per_second": 2.774, "step": 370 }, { "epoch": 4.06, "learning_rate": 3.1899641577060935e-05, "loss": 0.2195, "step": 380 }, { "epoch": 4.06, "eval_accuracy": 0.835106372833252, "eval_loss": 0.6165522336959839, "eval_runtime": 16.8992, "eval_samples_per_second": 11.125, "eval_steps_per_second": 2.781, "step": 380 }, { "epoch": 4.17, "learning_rate": 3.010752688172043e-05, "loss": 0.2313, "step": 390 }, { "epoch": 4.17, "eval_accuracy": 0.8457446694374084, "eval_loss": 0.5867934823036194, "eval_runtime": 16.9793, "eval_samples_per_second": 11.072, "eval_steps_per_second": 2.768, "step": 390 }, { "epoch": 4.28, "learning_rate": 2.831541218637993e-05, "loss": 0.0568, "step": 400 }, { "epoch": 4.28, "eval_accuracy": 0.8510638475418091, "eval_loss": 0.5872617959976196, "eval_runtime": 17.0099, "eval_samples_per_second": 11.052, "eval_steps_per_second": 2.763, "step": 400 }, { "epoch": 4.39, "learning_rate": 2.652329749103943e-05, "loss": 0.1141, "step": 410 }, { "epoch": 4.39, "eval_accuracy": 0.8563829660415649, "eval_loss": 0.5599851012229919, "eval_runtime": 17.0, "eval_samples_per_second": 11.059, "eval_steps_per_second": 2.765, "step": 410 }, { "epoch": 4.49, "learning_rate": 2.4731182795698928e-05, "loss": 0.1607, "step": 420 }, { "epoch": 4.49, "eval_accuracy": 0.8297872543334961, "eval_loss": 0.6353866457939148, "eval_runtime": 17.4021, "eval_samples_per_second": 10.803, "eval_steps_per_second": 2.701, "step": 420 }, { "epoch": 4.6, "learning_rate": 2.2939068100358423e-05, "loss": 0.0994, "step": 430 }, { "epoch": 4.6, "eval_accuracy": 0.8670212626457214, "eval_loss": 0.5707226991653442, "eval_runtime": 19.3599, "eval_samples_per_second": 9.711, "eval_steps_per_second": 2.428, "step": 430 }, { "epoch": 4.71, "learning_rate": 2.1146953405017922e-05, "loss": 0.125, "step": 440 }, { "epoch": 4.71, "eval_accuracy": 0.8510638475418091, "eval_loss": 0.6016649007797241, "eval_runtime": 18.1587, "eval_samples_per_second": 10.353, "eval_steps_per_second": 2.588, "step": 440 }, { "epoch": 4.81, "learning_rate": 1.935483870967742e-05, "loss": 0.1598, "step": 450 }, { "epoch": 4.81, "eval_accuracy": 0.8670212626457214, "eval_loss": 0.5569073557853699, "eval_runtime": 17.221, "eval_samples_per_second": 10.917, "eval_steps_per_second": 2.729, "step": 450 }, { "epoch": 4.92, "learning_rate": 1.7562724014336916e-05, "loss": 0.1274, "step": 460 }, { "epoch": 4.92, "eval_accuracy": 0.8670212626457214, "eval_loss": 0.5385098457336426, "eval_runtime": 17.4593, "eval_samples_per_second": 10.768, "eval_steps_per_second": 2.692, "step": 460 }, { "epoch": 5.03, "learning_rate": 1.5770609318996415e-05, "loss": 0.131, "step": 470 }, { "epoch": 5.03, "eval_accuracy": 0.8563829660415649, "eval_loss": 0.5594031810760498, "eval_runtime": 17.5202, "eval_samples_per_second": 10.73, "eval_steps_per_second": 2.683, "step": 470 }, { "epoch": 5.13, "learning_rate": 1.3978494623655914e-05, "loss": 0.0754, "step": 480 }, { "epoch": 5.13, "eval_accuracy": 0.8563829660415649, "eval_loss": 0.5571564435958862, "eval_runtime": 18.3965, "eval_samples_per_second": 10.219, "eval_steps_per_second": 2.555, "step": 480 }, { "epoch": 5.24, "learning_rate": 1.2186379928315413e-05, "loss": 0.0298, "step": 490 }, { "epoch": 5.24, "eval_accuracy": 0.8776595592498779, "eval_loss": 0.5584980845451355, "eval_runtime": 18.2497, "eval_samples_per_second": 10.302, "eval_steps_per_second": 2.575, "step": 490 }, { "epoch": 5.35, "learning_rate": 1.039426523297491e-05, "loss": 0.012, "step": 500 }, { "epoch": 5.35, "eval_accuracy": 0.8723404407501221, "eval_loss": 0.5599969625473022, "eval_runtime": 17.3404, "eval_samples_per_second": 10.842, "eval_steps_per_second": 2.71, "step": 500 }, { "epoch": 5.45, "learning_rate": 8.602150537634409e-06, "loss": 0.178, "step": 510 }, { "epoch": 5.45, "eval_accuracy": 0.8723404407501221, "eval_loss": 0.5632631778717041, "eval_runtime": 17.983, "eval_samples_per_second": 10.454, "eval_steps_per_second": 2.614, "step": 510 }, { "epoch": 5.56, "learning_rate": 6.810035842293908e-06, "loss": 0.0365, "step": 520 }, { "epoch": 5.56, "eval_accuracy": 0.8670212626457214, "eval_loss": 0.5653506517410278, "eval_runtime": 18.484, "eval_samples_per_second": 10.171, "eval_steps_per_second": 2.543, "step": 520 }, { "epoch": 5.67, "learning_rate": 5.017921146953405e-06, "loss": 0.0958, "step": 530 }, { "epoch": 5.67, "eval_accuracy": 0.8723404407501221, "eval_loss": 0.5490068197250366, "eval_runtime": 16.8627, "eval_samples_per_second": 11.149, "eval_steps_per_second": 2.787, "step": 530 }, { "epoch": 5.78, "learning_rate": 3.225806451612903e-06, "loss": 0.0525, "step": 540 }, { "epoch": 5.78, "eval_accuracy": 0.8776595592498779, "eval_loss": 0.542866051197052, "eval_runtime": 18.0905, "eval_samples_per_second": 10.392, "eval_steps_per_second": 2.598, "step": 540 }, { "epoch": 5.88, "learning_rate": 1.4336917562724014e-06, "loss": 0.0144, "step": 550 }, { "epoch": 5.88, "eval_accuracy": 0.8829787373542786, "eval_loss": 0.5416831970214844, "eval_runtime": 17.9022, "eval_samples_per_second": 10.502, "eval_steps_per_second": 2.625, "step": 550 } ], "logging_steps": 10, "max_steps": 558, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 10, "total_flos": 7.083294890110614e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }