{ "best_metric": null, "best_model_checkpoint": null, "epoch": 19.946666666666665, "eval_steps": 500, "global_step": 3740, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.5333333333333333, "grad_norm": 0.26794755458831787, "learning_rate": 3e-05, "loss": 1.8512, "step": 100 }, { "epoch": 0.9973333333333333, "eval_accuracy": 0.6031838565022422, "eval_loss": 1.7184041738510132, "eval_runtime": 6.0677, "eval_samples_per_second": 82.403, "eval_steps_per_second": 10.383, "step": 187 }, { "epoch": 1.0666666666666667, "grad_norm": 0.37397125363349915, "learning_rate": 3e-05, "loss": 1.7523, "step": 200 }, { "epoch": 1.6, "grad_norm": 0.35164713859558105, "learning_rate": 3e-05, "loss": 1.7174, "step": 300 }, { "epoch": 2.0, "eval_accuracy": 0.604932735426009, "eval_loss": 1.70177161693573, "eval_runtime": 6.4641, "eval_samples_per_second": 77.351, "eval_steps_per_second": 9.746, "step": 375 }, { "epoch": 2.1333333333333333, "grad_norm": 0.3863197863101959, "learning_rate": 3e-05, "loss": 1.7212, "step": 400 }, { "epoch": 2.6666666666666665, "grad_norm": 0.4171089828014374, "learning_rate": 3e-05, "loss": 1.6805, "step": 500 }, { "epoch": 2.997333333333333, "eval_accuracy": 0.6061434977578475, "eval_loss": 1.693791151046753, "eval_runtime": 6.1109, "eval_samples_per_second": 81.821, "eval_steps_per_second": 10.309, "step": 562 }, { "epoch": 3.2, "grad_norm": 0.5697806477546692, "learning_rate": 3e-05, "loss": 1.6692, "step": 600 }, { "epoch": 3.7333333333333334, "grad_norm": 0.5351872444152832, "learning_rate": 3e-05, "loss": 1.6412, "step": 700 }, { "epoch": 4.0, "eval_accuracy": 0.6067354260089686, "eval_loss": 1.6924760341644287, "eval_runtime": 6.7092, "eval_samples_per_second": 74.525, "eval_steps_per_second": 9.39, "step": 750 }, { "epoch": 4.266666666666667, "grad_norm": 0.6729754209518433, "learning_rate": 3e-05, "loss": 1.6073, "step": 800 }, { "epoch": 4.8, "grad_norm": 0.7071828246116638, "learning_rate": 3e-05, "loss": 1.5834, "step": 900 }, { "epoch": 4.997333333333334, "eval_accuracy": 0.6062242152466367, "eval_loss": 1.7046513557434082, "eval_runtime": 6.7276, "eval_samples_per_second": 74.32, "eval_steps_per_second": 9.364, "step": 937 }, { "epoch": 5.333333333333333, "grad_norm": 0.8920167684555054, "learning_rate": 3e-05, "loss": 1.5161, "step": 1000 }, { "epoch": 5.866666666666667, "grad_norm": 0.8955753445625305, "learning_rate": 3e-05, "loss": 1.5304, "step": 1100 }, { "epoch": 6.0, "eval_accuracy": 0.6056143497757848, "eval_loss": 1.723886251449585, "eval_runtime": 6.7785, "eval_samples_per_second": 73.763, "eval_steps_per_second": 9.294, "step": 1125 }, { "epoch": 6.4, "grad_norm": 0.9878254532814026, "learning_rate": 3e-05, "loss": 1.4575, "step": 1200 }, { "epoch": 6.933333333333334, "grad_norm": 1.1449097394943237, "learning_rate": 3e-05, "loss": 1.452, "step": 1300 }, { "epoch": 6.997333333333334, "eval_accuracy": 0.6039192825112107, "eval_loss": 1.7508341073989868, "eval_runtime": 6.0652, "eval_samples_per_second": 82.437, "eval_steps_per_second": 10.387, "step": 1312 }, { "epoch": 7.466666666666667, "grad_norm": 1.1436129808425903, "learning_rate": 3e-05, "loss": 1.3916, "step": 1400 }, { "epoch": 8.0, "grad_norm": 1.1151554584503174, "learning_rate": 3e-05, "loss": 1.3847, "step": 1500 }, { "epoch": 8.0, "eval_accuracy": 0.602762331838565, "eval_loss": 1.7711397409439087, "eval_runtime": 6.0614, "eval_samples_per_second": 82.489, "eval_steps_per_second": 10.394, "step": 1500 }, { "epoch": 8.533333333333333, "grad_norm": 1.3540856838226318, "learning_rate": 3e-05, "loss": 1.3177, "step": 1600 }, { "epoch": 8.997333333333334, "eval_accuracy": 0.6008699551569506, "eval_loss": 1.8049190044403076, "eval_runtime": 6.505, "eval_samples_per_second": 76.864, "eval_steps_per_second": 9.685, "step": 1687 }, { "epoch": 9.066666666666666, "grad_norm": 1.5734765529632568, "learning_rate": 3e-05, "loss": 1.326, "step": 1700 }, { "epoch": 9.6, "grad_norm": 1.6235650777816772, "learning_rate": 3e-05, "loss": 1.2747, "step": 1800 }, { "epoch": 10.0, "eval_accuracy": 0.599847533632287, "eval_loss": 1.8297590017318726, "eval_runtime": 6.3451, "eval_samples_per_second": 78.801, "eval_steps_per_second": 9.929, "step": 1875 }, { "epoch": 10.133333333333333, "grad_norm": 1.6720285415649414, "learning_rate": 3e-05, "loss": 1.2459, "step": 1900 }, { "epoch": 10.666666666666666, "grad_norm": 1.6803735494613647, "learning_rate": 3e-05, "loss": 1.2202, "step": 2000 }, { "epoch": 10.997333333333334, "eval_accuracy": 0.5980717488789238, "eval_loss": 1.881359338760376, "eval_runtime": 5.6855, "eval_samples_per_second": 87.942, "eval_steps_per_second": 11.081, "step": 2062 }, { "epoch": 11.2, "grad_norm": 1.7732399702072144, "learning_rate": 3e-05, "loss": 1.1909, "step": 2100 }, { "epoch": 11.733333333333333, "grad_norm": 1.6892499923706055, "learning_rate": 3e-05, "loss": 1.1589, "step": 2200 }, { "epoch": 12.0, "eval_accuracy": 0.5959372197309417, "eval_loss": 1.9310756921768188, "eval_runtime": 6.0847, "eval_samples_per_second": 82.173, "eval_steps_per_second": 10.354, "step": 2250 }, { "epoch": 12.266666666666667, "grad_norm": 1.7170026302337646, "learning_rate": 3e-05, "loss": 1.1364, "step": 2300 }, { "epoch": 12.8, "grad_norm": 1.7617255449295044, "learning_rate": 3e-05, "loss": 1.1231, "step": 2400 }, { "epoch": 12.997333333333334, "eval_accuracy": 0.5955336322869955, "eval_loss": 1.9429402351379395, "eval_runtime": 6.8196, "eval_samples_per_second": 73.318, "eval_steps_per_second": 9.238, "step": 2437 }, { "epoch": 13.333333333333334, "grad_norm": 2.0009663105010986, "learning_rate": 3e-05, "loss": 1.0823, "step": 2500 }, { "epoch": 13.866666666666667, "grad_norm": 2.044010877609253, "learning_rate": 3e-05, "loss": 1.0624, "step": 2600 }, { "epoch": 14.0, "eval_accuracy": 0.593255605381166, "eval_loss": 1.996883511543274, "eval_runtime": 6.0345, "eval_samples_per_second": 82.857, "eval_steps_per_second": 10.44, "step": 2625 }, { "epoch": 14.4, "grad_norm": 2.0219309329986572, "learning_rate": 3e-05, "loss": 1.0217, "step": 2700 }, { "epoch": 14.933333333333334, "grad_norm": 2.0420074462890625, "learning_rate": 3e-05, "loss": 1.0185, "step": 2800 }, { "epoch": 14.997333333333334, "eval_accuracy": 0.592170403587444, "eval_loss": 2.031921625137329, "eval_runtime": 5.9337, "eval_samples_per_second": 84.265, "eval_steps_per_second": 10.617, "step": 2812 }, { "epoch": 15.466666666666667, "grad_norm": 2.2069709300994873, "learning_rate": 3e-05, "loss": 0.9645, "step": 2900 }, { "epoch": 16.0, "grad_norm": 1.863767147064209, "learning_rate": 3e-05, "loss": 0.9718, "step": 3000 }, { "epoch": 16.0, "eval_accuracy": 0.5902959641255605, "eval_loss": 2.0798022747039795, "eval_runtime": 6.143, "eval_samples_per_second": 81.394, "eval_steps_per_second": 10.256, "step": 3000 }, { "epoch": 16.533333333333335, "grad_norm": 2.300877809524536, "learning_rate": 3e-05, "loss": 0.9101, "step": 3100 }, { "epoch": 16.997333333333334, "eval_accuracy": 0.5887354260089687, "eval_loss": 2.13960337638855, "eval_runtime": 6.0122, "eval_samples_per_second": 83.164, "eval_steps_per_second": 10.479, "step": 3187 }, { "epoch": 17.066666666666666, "grad_norm": 2.586768388748169, "learning_rate": 3e-05, "loss": 0.9136, "step": 3200 }, { "epoch": 17.6, "grad_norm": 2.3870368003845215, "learning_rate": 3e-05, "loss": 0.8606, "step": 3300 }, { "epoch": 18.0, "eval_accuracy": 0.5870134529147982, "eval_loss": 2.1882169246673584, "eval_runtime": 6.0214, "eval_samples_per_second": 83.036, "eval_steps_per_second": 10.463, "step": 3375 }, { "epoch": 18.133333333333333, "grad_norm": 2.6867830753326416, "learning_rate": 3e-05, "loss": 0.8724, "step": 3400 }, { "epoch": 18.666666666666668, "grad_norm": 2.807380199432373, "learning_rate": 3e-05, "loss": 0.8168, "step": 3500 }, { "epoch": 18.997333333333334, "eval_accuracy": 0.5862600896860987, "eval_loss": 2.2291159629821777, "eval_runtime": 6.3423, "eval_samples_per_second": 78.836, "eval_steps_per_second": 9.933, "step": 3562 }, { "epoch": 19.2, "grad_norm": 2.6379778385162354, "learning_rate": 3e-05, "loss": 0.8061, "step": 3600 }, { "epoch": 19.733333333333334, "grad_norm": 3.161210775375366, "learning_rate": 3e-05, "loss": 0.777, "step": 3700 }, { "epoch": 19.946666666666665, "eval_accuracy": 0.5851838565022421, "eval_loss": 2.2508349418640137, "eval_runtime": 5.6415, "eval_samples_per_second": 88.63, "eval_steps_per_second": 11.167, "step": 3740 }, { "epoch": 19.946666666666665, "step": 3740, "total_flos": 3.767212755417825e+17, "train_loss": 1.2655855785716663, "train_runtime": 8167.3443, "train_samples_per_second": 14.693, "train_steps_per_second": 0.458 } ], "logging_steps": 100, "max_steps": 3740, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "total_flos": 3.767212755417825e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }