{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 1497, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05, "grad_norm": 1.2907918691635132, "learning_rate": 3.2000000000000005e-05, "loss": 1.3493, "step": 25 }, { "epoch": 0.1, "grad_norm": 1.2323806285858154, "learning_rate": 6.533333333333334e-05, "loss": 1.1043, "step": 50 }, { "epoch": 0.15, "grad_norm": 1.096836805343628, "learning_rate": 9.866666666666668e-05, "loss": 0.9079, "step": 75 }, { "epoch": 0.2, "grad_norm": 0.9220419526100159, "learning_rate": 0.000132, "loss": 0.8502, "step": 100 }, { "epoch": 0.25, "grad_norm": 0.9188351631164551, "learning_rate": 0.00016533333333333333, "loss": 0.7941, "step": 125 }, { "epoch": 0.3, "grad_norm": 0.9083553552627563, "learning_rate": 0.00019866666666666668, "loss": 0.7949, "step": 150 }, { "epoch": 0.35, "grad_norm": 0.7864977717399597, "learning_rate": 0.00019643652561247217, "loss": 0.7836, "step": 175 }, { "epoch": 0.4, "grad_norm": 0.8785583972930908, "learning_rate": 0.00019272457312546402, "loss": 0.775, "step": 200 }, { "epoch": 0.45, "grad_norm": 0.7661826014518738, "learning_rate": 0.00018901262063845584, "loss": 0.7749, "step": 225 }, { "epoch": 0.5, "grad_norm": 0.8582839965820312, "learning_rate": 0.00018530066815144767, "loss": 0.7408, "step": 250 }, { "epoch": 0.55, "grad_norm": 0.8379924297332764, "learning_rate": 0.00018158871566443952, "loss": 0.7563, "step": 275 }, { "epoch": 0.6, "grad_norm": 0.8311147093772888, "learning_rate": 0.00017787676317743134, "loss": 0.7645, "step": 300 }, { "epoch": 0.65, "grad_norm": 0.8674755096435547, "learning_rate": 0.00017416481069042317, "loss": 0.7558, "step": 325 }, { "epoch": 0.7, "grad_norm": 0.725708544254303, "learning_rate": 0.00017045285820341502, "loss": 0.7442, "step": 350 }, { "epoch": 0.75, "grad_norm": 0.676539421081543, "learning_rate": 0.00016674090571640684, "loss": 0.758, "step": 375 }, { "epoch": 0.8, "grad_norm": 0.7118335962295532, "learning_rate": 0.00016302895322939867, "loss": 0.7181, "step": 400 }, { "epoch": 0.85, "grad_norm": 0.7514967918395996, "learning_rate": 0.00015931700074239052, "loss": 0.7415, "step": 425 }, { "epoch": 0.9, "grad_norm": 0.6775885224342346, "learning_rate": 0.00015560504825538234, "loss": 0.7106, "step": 450 }, { "epoch": 0.95, "grad_norm": 0.7105979323387146, "learning_rate": 0.0001518930957683742, "loss": 0.7078, "step": 475 }, { "epoch": 1.0, "grad_norm": 0.6819420456886292, "learning_rate": 0.00014818114328136602, "loss": 0.726, "step": 500 }, { "epoch": 1.05, "grad_norm": 0.7699077129364014, "learning_rate": 0.00014446919079435784, "loss": 0.6885, "step": 525 }, { "epoch": 1.1, "grad_norm": 0.6890037059783936, "learning_rate": 0.0001407572383073497, "loss": 0.6874, "step": 550 }, { "epoch": 1.15, "grad_norm": 0.7747482657432556, "learning_rate": 0.00013704528582034152, "loss": 0.6822, "step": 575 }, { "epoch": 1.2, "grad_norm": 0.7736285328865051, "learning_rate": 0.00013333333333333334, "loss": 0.6905, "step": 600 }, { "epoch": 1.25, "grad_norm": 1.0000784397125244, "learning_rate": 0.0001296213808463252, "loss": 0.6844, "step": 625 }, { "epoch": 1.3, "grad_norm": 0.6599183082580566, "learning_rate": 0.00012590942835931702, "loss": 0.6783, "step": 650 }, { "epoch": 1.35, "grad_norm": 0.7472581267356873, "learning_rate": 0.00012219747587230884, "loss": 0.6976, "step": 675 }, { "epoch": 1.4, "grad_norm": 0.7868366241455078, "learning_rate": 
0.00011848552338530068, "loss": 0.687, "step": 700 }, { "epoch": 1.45, "grad_norm": 0.7280272841453552, "learning_rate": 0.0001147735708982925, "loss": 0.6795, "step": 725 }, { "epoch": 1.5, "grad_norm": 0.7689390182495117, "learning_rate": 0.00011106161841128435, "loss": 0.6667, "step": 750 }, { "epoch": 1.55, "grad_norm": 0.7489085793495178, "learning_rate": 0.00010734966592427618, "loss": 0.668, "step": 775 }, { "epoch": 1.6, "grad_norm": 0.7128411531448364, "learning_rate": 0.000103637713437268, "loss": 0.7063, "step": 800 }, { "epoch": 1.65, "grad_norm": 0.7407069802284241, "learning_rate": 9.992576095025984e-05, "loss": 0.6566, "step": 825 }, { "epoch": 1.7, "grad_norm": 0.7692396640777588, "learning_rate": 9.621380846325168e-05, "loss": 0.6735, "step": 850 }, { "epoch": 1.75, "grad_norm": 0.7389003038406372, "learning_rate": 9.250185597624351e-05, "loss": 0.6641, "step": 875 }, { "epoch": 1.8, "grad_norm": 0.8007437586784363, "learning_rate": 8.878990348923534e-05, "loss": 0.6426, "step": 900 }, { "epoch": 1.85, "grad_norm": 0.762878954410553, "learning_rate": 8.507795100222718e-05, "loss": 0.7014, "step": 925 }, { "epoch": 1.9, "grad_norm": 0.685552179813385, "learning_rate": 8.136599851521901e-05, "loss": 0.6667, "step": 950 }, { "epoch": 1.95, "grad_norm": 0.7692657113075256, "learning_rate": 7.765404602821085e-05, "loss": 0.6695, "step": 975 }, { "epoch": 2.0, "grad_norm": 0.758256733417511, "learning_rate": 7.394209354120267e-05, "loss": 0.6859, "step": 1000 }, { "epoch": 2.05, "grad_norm": 0.7138740420341492, "learning_rate": 7.023014105419451e-05, "loss": 0.6221, "step": 1025 }, { "epoch": 2.1, "grad_norm": 0.781160831451416, "learning_rate": 6.651818856718635e-05, "loss": 0.6256, "step": 1050 }, { "epoch": 2.15, "grad_norm": 0.7872413992881775, "learning_rate": 6.280623608017817e-05, "loss": 0.6126, "step": 1075 }, { "epoch": 2.2, "grad_norm": 0.8612145185470581, "learning_rate": 5.9094283593170005e-05, "loss": 0.5982, "step": 1100 }, { "epoch": 2.25, "grad_norm": 0.8253654837608337, "learning_rate": 5.538233110616184e-05, "loss": 0.5968, "step": 1125 }, { "epoch": 2.3, "grad_norm": 0.7710873484611511, "learning_rate": 5.167037861915368e-05, "loss": 0.628, "step": 1150 }, { "epoch": 2.35, "grad_norm": 0.8175609111785889, "learning_rate": 4.795842613214551e-05, "loss": 0.6342, "step": 1175 }, { "epoch": 2.4, "grad_norm": 0.8151463270187378, "learning_rate": 4.424647364513734e-05, "loss": 0.6434, "step": 1200 }, { "epoch": 2.45, "grad_norm": 0.8666600584983826, "learning_rate": 4.053452115812918e-05, "loss": 0.6046, "step": 1225 }, { "epoch": 2.51, "grad_norm": 0.9800847768783569, "learning_rate": 3.682256867112101e-05, "loss": 0.6037, "step": 1250 }, { "epoch": 2.56, "grad_norm": 0.8552590608596802, "learning_rate": 3.311061618411285e-05, "loss": 0.6284, "step": 1275 }, { "epoch": 2.61, "grad_norm": 0.8620779514312744, "learning_rate": 2.939866369710468e-05, "loss": 0.636, "step": 1300 }, { "epoch": 2.66, "grad_norm": 0.8995267748832703, "learning_rate": 2.5686711210096513e-05, "loss": 0.614, "step": 1325 }, { "epoch": 2.71, "grad_norm": 0.7862620949745178, "learning_rate": 2.1974758723088344e-05, "loss": 0.6499, "step": 1350 }, { "epoch": 2.76, "grad_norm": 0.8439193964004517, "learning_rate": 1.826280623608018e-05, "loss": 0.6094, "step": 1375 }, { "epoch": 2.81, "grad_norm": 0.8704841136932373, "learning_rate": 1.4550853749072013e-05, "loss": 0.6121, "step": 1400 }, { "epoch": 2.86, "grad_norm": 0.8250275254249573, "learning_rate": 1.0838901262063845e-05, "loss": 0.6127, 
"step": 1425 }, { "epoch": 2.91, "grad_norm": 0.7634843587875366, "learning_rate": 7.12694877505568e-06, "loss": 0.6159, "step": 1450 }, { "epoch": 2.96, "grad_norm": 0.829576313495636, "learning_rate": 3.414996288047513e-06, "loss": 0.5931, "step": 1475 } ], "logging_steps": 25, "max_steps": 1497, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 2.6185387210702848e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }