{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9985652797704447, "eval_steps": 500, "global_step": 261, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "grad_norm": 0.45229363441467285, "learning_rate": 9.990947518281311e-05, "loss": 1.2293, "step": 5 }, { "epoch": 0.04, "grad_norm": 0.45895835757255554, "learning_rate": 9.963822852095345e-05, "loss": 1.0332, "step": 10 }, { "epoch": 0.06, "grad_norm": 1.148818850517273, "learning_rate": 9.918724219660013e-05, "loss": 1.1688, "step": 15 }, { "epoch": 0.08, "grad_norm": 0.6193984150886536, "learning_rate": 9.855814922793582e-05, "loss": 0.8491, "step": 20 }, { "epoch": 0.1, "grad_norm": 0.5973692536354065, "learning_rate": 9.775322755599978e-05, "loss": 0.8356, "step": 25 }, { "epoch": 0.11, "grad_norm": 0.471483439207077, "learning_rate": 9.677539179628005e-05, "loss": 0.782, "step": 30 }, { "epoch": 0.13, "grad_norm": 0.5395711660385132, "learning_rate": 9.562818268491216e-05, "loss": 0.7464, "step": 35 }, { "epoch": 0.15, "grad_norm": 0.7068768739700317, "learning_rate": 9.431575425769938e-05, "loss": 0.6813, "step": 40 }, { "epoch": 0.17, "grad_norm": 0.7348042130470276, "learning_rate": 9.284285880837946e-05, "loss": 0.6508, "step": 45 }, { "epoch": 0.19, "grad_norm": 1.763695478439331, "learning_rate": 9.121482968060384e-05, "loss": 0.7296, "step": 50 }, { "epoch": 0.21, "grad_norm": 0.7689581513404846, "learning_rate": 8.943756195593916e-05, "loss": 0.604, "step": 55 }, { "epoch": 0.23, "grad_norm": 1.1951208114624023, "learning_rate": 8.751749110782012e-05, "loss": 0.594, "step": 60 }, { "epoch": 0.25, "grad_norm": 0.9435452818870544, "learning_rate": 8.546156969874723e-05, "loss": 0.6778, "step": 65 }, { "epoch": 0.27, "grad_norm": 0.8245531320571899, "learning_rate": 8.327724220510873e-05, "loss": 0.6173, "step": 70 }, { "epoch": 0.29, "grad_norm": 0.6063089966773987, "learning_rate": 8.097241806078615e-05, "loss": 0.7239, "step": 75 }, { "epoch": 0.31, "grad_norm": 0.4883978068828583, "learning_rate": 7.855544301715203e-05, "loss": 0.5158, "step": 80 }, { "epoch": 0.33, "grad_norm": 0.6936870217323303, "learning_rate": 7.603506892316512e-05, "loss": 0.5011, "step": 85 }, { "epoch": 0.34, "grad_norm": 2.553333282470703, "learning_rate": 7.342042203498951e-05, "loss": 0.6461, "step": 90 }, { "epoch": 0.36, "grad_norm": 0.9251458644866943, "learning_rate": 7.07209699698876e-05, "loss": 0.7273, "step": 95 }, { "epoch": 0.38, "grad_norm": 0.699193000793457, "learning_rate": 6.79464874240473e-05, "loss": 0.5878, "step": 100 }, { "epoch": 0.4, "grad_norm": 0.661618173122406, "learning_rate": 6.510702077847863e-05, "loss": 0.4787, "step": 105 }, { "epoch": 0.42, "grad_norm": 0.4883117079734802, "learning_rate": 6.221285172114157e-05, "loss": 0.5766, "step": 110 }, { "epoch": 0.44, "grad_norm": 2.715449810028076, "learning_rate": 5.927446001702899e-05, "loss": 0.5553, "step": 115 }, { "epoch": 0.46, "grad_norm": 0.6433199644088745, "learning_rate": 5.6302485561014475e-05, "loss": 0.7007, "step": 120 }, { "epoch": 0.48, "grad_norm": 1.7128715515136719, "learning_rate": 5.330768985087059e-05, "loss": 0.5773, "step": 125 }, { "epoch": 0.5, "grad_norm": 0.7177631258964539, "learning_rate": 5.030091701996428e-05, "loss": 0.5787, "step": 130 }, { "epoch": 0.52, "grad_norm": 0.6795344948768616, "learning_rate": 4.729305457072913e-05, "loss": 0.5013, "step": 135 }, { "epoch": 0.54, "grad_norm": 1.1840591430664062, "learning_rate": 4.429499395109877e-05, "loss": 0.6419, "step": 140 }, { "epoch": 0.55, "grad_norm": 0.7961875200271606, "learning_rate": 4.131759111665349e-05, "loss": 0.4529, "step": 145 }, { "epoch": 0.57, "grad_norm": 0.7156918048858643, "learning_rate": 3.8371627221284495e-05, "loss": 0.5831, "step": 150 }, { "epoch": 0.59, "grad_norm": 0.8424770832061768, "learning_rate": 3.546776957871445e-05, "loss": 0.6044, "step": 155 }, { "epoch": 0.61, "grad_norm": 0.9914979338645935, "learning_rate": 3.261653303623263e-05, "loss": 0.5824, "step": 160 }, { "epoch": 0.63, "grad_norm": 1.0051041841506958, "learning_rate": 2.982824190050958e-05, "loss": 0.4595, "step": 165 }, { "epoch": 0.65, "grad_norm": 1.2977936267852783, "learning_rate": 2.711299255335833e-05, "loss": 0.544, "step": 170 }, { "epoch": 0.67, "grad_norm": 0.5247394442558289, "learning_rate": 2.4480616892809594e-05, "loss": 0.5458, "step": 175 }, { "epoch": 0.69, "grad_norm": 0.9446833729743958, "learning_rate": 2.194064673188089e-05, "loss": 0.5625, "step": 180 }, { "epoch": 0.71, "grad_norm": 0.9111084938049316, "learning_rate": 1.9502279283951364e-05, "loss": 0.5163, "step": 185 }, { "epoch": 0.73, "grad_norm": 0.9301843643188477, "learning_rate": 1.7174343859719333e-05, "loss": 0.5594, "step": 190 }, { "epoch": 0.75, "grad_norm": 1.5420268774032593, "learning_rate": 1.4965269896332885e-05, "loss": 0.4737, "step": 195 }, { "epoch": 0.77, "grad_norm": 0.866782546043396, "learning_rate": 1.2883056434459506e-05, "loss": 0.5539, "step": 200 }, { "epoch": 0.78, "grad_norm": 1.928578495979309, "learning_rate": 1.0935243153818436e-05, "loss": 0.5426, "step": 205 }, { "epoch": 0.8, "grad_norm": 0.5703736543655396, "learning_rate": 9.12888307205541e-06, "loss": 0.5781, "step": 210 }, { "epoch": 0.82, "grad_norm": 1.3220263719558716, "learning_rate": 7.470517005817474e-06, "loss": 0.6523, "step": 215 }, { "epoch": 0.84, "grad_norm": 4.24189567565918, "learning_rate": 5.966149886503614e-06, "loss": 0.5199, "step": 220 }, { "epoch": 0.86, "grad_norm": 0.606140673160553, "learning_rate": 4.621229016452156e-06, "loss": 0.487, "step": 225 }, { "epoch": 0.88, "grad_norm": 0.7255881428718567, "learning_rate": 3.4406243442987764e-06, "loss": 0.5654, "step": 230 }, { "epoch": 0.9, "grad_norm": 0.7342286705970764, "learning_rate": 2.428610830928152e-06, "loss": 0.5211, "step": 235 }, { "epoch": 0.92, "grad_norm": 1.0628710985183716, "learning_rate": 1.5888529698718346e-06, "loss": 0.5965, "step": 240 }, { "epoch": 0.94, "grad_norm": 0.6802533268928528, "learning_rate": 9.243915182039431e-07, "loss": 0.542, "step": 245 }, { "epoch": 0.96, "grad_norm": 2.282472610473633, "learning_rate": 4.376324859820924e-07, "loss": 0.5677, "step": 250 }, { "epoch": 0.98, "grad_norm": 2.717949628829956, "learning_rate": 1.3033842410251075e-07, "loss": 0.4775, "step": 255 }, { "epoch": 0.99, "grad_norm": 1.769280195236206, "learning_rate": 3.622042116169233e-09, "loss": 0.6862, "step": 260 }, { "epoch": 1.0, "step": 261, "total_flos": 3.809983450683802e+16, "train_loss": 0.6279083029063726, "train_runtime": 203702.1714, "train_samples_per_second": 0.01, "train_steps_per_second": 0.001 } ], "logging_steps": 5, "max_steps": 261, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 3.809983450683802e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }