{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 217, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 5.18852740291647, "learning_rate": 9.090909090909091e-06, "loss": 5.3084, "step": 1 }, { "epoch": 0.02, "grad_norm": 5.03901362055586, "learning_rate": 4.545454545454546e-05, "loss": 5.0728, "step": 5 }, { "epoch": 0.05, "grad_norm": 4.459588393324012, "learning_rate": 9.090909090909092e-05, "loss": 2.37, "step": 10 }, { "epoch": 0.07, "grad_norm": 0.43459848938578693, "learning_rate": 0.00013636363636363637, "loss": 0.1307, "step": 15 }, { "epoch": 0.09, "grad_norm": 0.0299929460934137, "learning_rate": 0.00018181818181818183, "loss": 0.0483, "step": 20 }, { "epoch": 0.12, "grad_norm": 0.013812266692594782, "learning_rate": 0.00019988322268323268, "loss": 0.0225, "step": 25 }, { "epoch": 0.14, "grad_norm": 0.009773102416496118, "learning_rate": 0.0001991705709821562, "loss": 0.0198, "step": 30 }, { "epoch": 0.16, "grad_norm": 0.0039386428485718465, "learning_rate": 0.00019781476007338058, "loss": 0.015, "step": 35 }, { "epoch": 0.18, "grad_norm": 0.002196082201771653, "learning_rate": 0.00019582458291091663, "loss": 0.0096, "step": 40 }, { "epoch": 0.21, "grad_norm": 0.018430687632200037, "learning_rate": 0.0001932129465573568, "loss": 0.0024, "step": 45 }, { "epoch": 0.23, "grad_norm": 0.9834813463855321, "learning_rate": 0.0001899967884766212, "loss": 0.0281, "step": 50 }, { "epoch": 0.25, "grad_norm": 0.005069811100933593, "learning_rate": 0.00018619696668800492, "loss": 0.0001, "step": 55 }, { "epoch": 0.28, "grad_norm": 0.007249668030582534, "learning_rate": 0.0001818381244939187, "loss": 0.0001, "step": 60 }, { "epoch": 0.3, "grad_norm": 0.0071458935911795545, "learning_rate": 0.00017694853065861662, "loss": 0.0001, "step": 65 }, { "epoch": 0.32, "grad_norm": 0.004757395926328272, "learning_rate": 
0.00017155989607441213, "loss": 0.0001, "step": 70 }, { "epoch": 0.35, "grad_norm": 0.0027371110149472387, "learning_rate": 0.0001657071681043731, "loss": 0.0001, "step": 75 }, { "epoch": 0.37, "grad_norm": 0.0015303804705887707, "learning_rate": 0.00015942830393526176, "loss": 0.0, "step": 80 }, { "epoch": 0.39, "grad_norm": 0.0019029295128688208, "learning_rate": 0.0001527640244106133, "loss": 0.0191, "step": 85 }, { "epoch": 0.41, "grad_norm": 0.002913339239522618, "learning_rate": 0.00014575754994043956, "loss": 0.0, "step": 90 }, { "epoch": 0.44, "grad_norm": 0.002526914206171285, "learning_rate": 0.0001384543202002851, "loss": 0.0, "step": 95 }, { "epoch": 0.46, "grad_norm": 0.002364320350741189, "learning_rate": 0.00013090169943749476, "loss": 0.0, "step": 100 }, { "epoch": 0.48, "grad_norm": 0.001415040670977836, "learning_rate": 0.00012314866929589432, "loss": 0.0, "step": 105 }, { "epoch": 0.51, "grad_norm": 0.0014806660372118452, "learning_rate": 0.00011524551115103454, "loss": 0.0, "step": 110 }, { "epoch": 0.53, "grad_norm": 0.0009238152395960922, "learning_rate": 0.00010724348001617625, "loss": 0.0, "step": 115 }, { "epoch": 0.55, "grad_norm": 0.316858259539684, "learning_rate": 9.919447213386103e-05, "loss": 0.0175, "step": 120 }, { "epoch": 0.58, "grad_norm": 0.0013793498850558623, "learning_rate": 9.115068840886417e-05, "loss": 0.0, "step": 125 }, { "epoch": 0.6, "grad_norm": 0.03179793467382126, "learning_rate": 8.316429586529615e-05, "loss": 0.0, "step": 130 }, { "epoch": 0.62, "grad_norm": 0.008302429594092641, "learning_rate": 7.528708932343304e-05, "loss": 0.0104, "step": 135 }, { "epoch": 0.65, "grad_norm": 0.024284403402931108, "learning_rate": 6.757015549043175e-05, "loss": 0.0001, "step": 140 }, { "epoch": 0.67, "grad_norm": 0.007160721938093322, "learning_rate": 6.006354164343046e-05, "loss": 0.0001, "step": 145 }, { "epoch": 0.69, "grad_norm": 0.00442645759914904, "learning_rate": 5.28159310537518e-05, "loss": 0.0, "step": 150 }, { 
"epoch": 0.71, "grad_norm": 0.001965826516583593, "learning_rate": 4.587432725720687e-05, "loss": 0.0, "step": 155 }, { "epoch": 0.74, "grad_norm": 0.0012986955816234175, "learning_rate": 3.9283749218128885e-05, "loss": 0.0, "step": 160 }, { "epoch": 0.76, "grad_norm": 0.0010247882146462446, "learning_rate": 3.308693936411421e-05, "loss": 0.0, "step": 165 }, { "epoch": 0.78, "grad_norm": 0.000977980722589858, "learning_rate": 2.7324086384977698e-05, "loss": 0.0, "step": 170 }, { "epoch": 0.81, "grad_norm": 0.0008640860356404602, "learning_rate": 2.2032564593677774e-05, "loss": 0.0, "step": 175 }, { "epoch": 0.83, "grad_norm": 0.0007152715573634965, "learning_rate": 1.7246691539555028e-05, "loss": 0.0, "step": 180 }, { "epoch": 0.85, "grad_norm": 0.0008068934592673394, "learning_rate": 1.2997505445856084e-05, "loss": 0.0, "step": 185 }, { "epoch": 0.88, "grad_norm": 0.000785102458262206, "learning_rate": 9.31256391494546e-06, "loss": 0.0, "step": 190 }, { "epoch": 0.9, "grad_norm": 0.0006662070791456656, "learning_rate": 6.215765206679569e-06, "loss": 0.0, "step": 195 }, { "epoch": 0.92, "grad_norm": 0.0006676407776271578, "learning_rate": 3.7271932490209328e-06, "loss": 0.0, "step": 200 }, { "epoch": 0.94, "grad_norm": 0.0006248827839313129, "learning_rate": 1.8629873860586566e-06, "loss": 0.0, "step": 205 }, { "epoch": 0.97, "grad_norm": 0.0006849249794092451, "learning_rate": 6.352377081687011e-07, "loss": 0.0, "step": 210 }, { "epoch": 0.99, "grad_norm": 0.0007915981269703633, "learning_rate": 5.190664313851068e-08, "loss": 0.0056, "step": 215 }, { "epoch": 1.0, "eval_loss": null, "eval_runtime": 745.5269, "eval_samples_per_second": 1.551, "eval_steps_per_second": 0.388, "step": 217 }, { "epoch": 1.0, "step": 217, "total_flos": 1973257544663040.0, "train_loss": 0.0019402640651178403, "train_runtime": 4565.6051, "train_samples_per_second": 2.276, "train_steps_per_second": 0.048 } ], "logging_steps": 5, "max_steps": 217, "num_input_tokens_seen": 0, 
"num_train_epochs": 1, "save_steps": 40, "total_flos": 1973257544663040.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }