{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04, "grad_norm": 2.072805643081665, "learning_rate": 4.995066821070679e-05, "loss": 1.1181, "step": 10 }, { "epoch": 0.08, "grad_norm": 2.6626899242401123, "learning_rate": 4.980286753286195e-05, "loss": 1.048, "step": 20 }, { "epoch": 0.12, "grad_norm": 3.8898932933807373, "learning_rate": 4.9557181268217227e-05, "loss": 1.0172, "step": 30 }, { "epoch": 0.16, "grad_norm": 2.715027093887329, "learning_rate": 4.9214579028215776e-05, "loss": 1.0621, "step": 40 }, { "epoch": 0.2, "grad_norm": 2.233584403991699, "learning_rate": 4.877641290737884e-05, "loss": 0.994, "step": 50 }, { "epoch": 0.24, "grad_norm": 2.1543967723846436, "learning_rate": 4.8244412147206284e-05, "loss": 1.0483, "step": 60 }, { "epoch": 0.28, "grad_norm": 2.3235280513763428, "learning_rate": 4.762067631165049e-05, "loss": 1.0077, "step": 70 }, { "epoch": 0.32, "grad_norm": 2.6280996799468994, "learning_rate": 4.690766700109659e-05, "loss": 0.9627, "step": 80 }, { "epoch": 0.36, "grad_norm": 2.4375860691070557, "learning_rate": 4.610819813755038e-05, "loss": 0.9753, "step": 90 }, { "epoch": 0.4, "grad_norm": 2.634675979614258, "learning_rate": 4.522542485937369e-05, "loss": 1.0424, "step": 100 }, { "epoch": 0.44, "grad_norm": 2.759512424468994, "learning_rate": 4.426283106939474e-05, "loss": 0.9185, "step": 110 }, { "epoch": 0.48, "grad_norm": 2.3730366230010986, "learning_rate": 4.3224215685535294e-05, "loss": 1.0032, "step": 120 }, { "epoch": 0.52, "grad_norm": 3.0645976066589355, "learning_rate": 4.211367764821722e-05, "loss": 0.995, "step": 130 }, { "epoch": 0.56, "grad_norm": 2.892836093902588, "learning_rate": 4.093559974371725e-05, "loss": 1.0167, "step": 140 }, { "epoch": 0.6, "grad_norm": 2.1179070472717285, "learning_rate": 3.969463130731183e-05, "loss": 0.9908, "step": 150 }, { "epoch": 0.64, "grad_norm": 2.6164073944091797, "learning_rate": 3.8395669874474915e-05, "loss": 0.9903, "step": 160 }, { "epoch": 0.68, "grad_norm": 2.0717203617095947, "learning_rate": 3.704384185254288e-05, "loss": 1.0031, "step": 170 }, { "epoch": 0.72, "grad_norm": 2.2452661991119385, "learning_rate": 3.564448228912682e-05, "loss": 0.9469, "step": 180 }, { "epoch": 0.76, "grad_norm": 2.4002890586853027, "learning_rate": 3.4203113817116957e-05, "loss": 0.9221, "step": 190 }, { "epoch": 0.8, "grad_norm": 2.4350810050964355, "learning_rate": 3.287466299138262e-05, "loss": 0.9588, "step": 200 }, { "epoch": 0.84, "grad_norm": 2.338216781616211, "learning_rate": 3.1369268142084556e-05, "loss": 0.9762, "step": 210 }, { "epoch": 0.88, "grad_norm": 2.716254472732544, "learning_rate": 2.9838736701271514e-05, "loss": 0.9577, "step": 220 }, { "epoch": 0.92, "grad_norm": 2.5938873291015625, "learning_rate": 2.8289108977307067e-05, "loss": 1.0061, "step": 230 }, { "epoch": 0.96, "grad_norm": 2.575774669647217, "learning_rate": 2.6726500642860154e-05, "loss": 1.0507, "step": 240 }, { "epoch": 1.0, "grad_norm": 2.5186479091644287, "learning_rate": 2.5157078599138977e-05, "loss": 0.951, "step": 250 }, { "epoch": 1.04, "grad_norm": 2.128981590270996, "learning_rate": 2.3587036637949388e-05, "loss": 0.8208, "step": 260 }, { "epoch": 1.08, "grad_norm": 2.1169350147247314, "learning_rate": 2.2022570997628256e-05, "loss": 0.7953, "step": 270 }, { "epoch": 1.12, "grad_norm": 2.3190295696258545, "learning_rate": 2.0469855909321564e-05, "loss": 0.7427, "step": 280 }, { "epoch": 1.16, "grad_norm": 2.6951375007629395, "learning_rate": 1.908752507440689e-05, "loss": 0.763, "step": 290 }, { "epoch": 1.2, "grad_norm": 2.6969361305236816, "learning_rate": 1.7573960460574133e-05, "loss": 0.7914, "step": 300 }, { "epoch": 1.24, "grad_norm": 2.5656094551086426, "learning_rate": 1.6089703032168733e-05, "loss": 0.7915, "step": 310 }, { "epoch": 1.28, "grad_norm": 3.075153112411499, "learning_rate": 1.4640610475167898e-05, "loss": 0.785, "step": 320 }, { "epoch": 1.32, "grad_norm": 3.3679215908050537, "learning_rate": 1.3232401695866687e-05, "loss": 0.7762, "step": 330 }, { "epoch": 1.36, "grad_norm": 3.499119281768799, "learning_rate": 1.1870634250967605e-05, "loss": 0.8098, "step": 340 }, { "epoch": 1.4, "grad_norm": 2.9737751483917236, "learning_rate": 1.0560682414443315e-05, "loss": 0.7372, "step": 350 }, { "epoch": 1.44, "grad_norm": 3.3601698875427246, "learning_rate": 9.307715967732491e-06, "loss": 0.8166, "step": 360 }, { "epoch": 1.48, "grad_norm": 3.3548429012298584, "learning_rate": 8.116679796974388e-06, "loss": 0.7472, "step": 370 }, { "epoch": 1.52, "grad_norm": 2.8706016540527344, "learning_rate": 6.992274377802327e-06, "loss": 0.7355, "step": 380 }, { "epoch": 1.56, "grad_norm": 2.9783272743225098, "learning_rate": 5.9389372247138e-06, "loss": 0.7639, "step": 390 }, { "epoch": 1.6, "grad_norm": 3.8318257331848145, "learning_rate": 4.960825378228082e-06, "loss": 0.8115, "step": 400 }, { "epoch": 1.64, "grad_norm": 3.5943961143493652, "learning_rate": 4.061798998946459e-06, "loss": 0.7761, "step": 410 }, { "epoch": 1.68, "grad_norm": 3.3045473098754883, "learning_rate": 3.245406133261858e-06, "loss": 0.805, "step": 420 }, { "epoch": 1.72, "grad_norm": 2.879966974258423, "learning_rate": 2.514868710840723e-06, "loss": 0.7729, "step": 430 }, { "epoch": 1.76, "grad_norm": 4.045947551727295, "learning_rate": 1.8730698291385518e-06, "loss": 0.7135, "step": 440 }, { "epoch": 1.8, "grad_norm": 3.1562774181365967, "learning_rate": 1.3225423751313942e-06, "loss": 0.827, "step": 450 }, { "epoch": 1.84, "grad_norm": 3.1504297256469727, "learning_rate": 8.65459029168153e-07, "loss": 0.7455, "step": 460 }, { "epoch": 1.88, "grad_norm": 3.496607780456543, "learning_rate": 5.036236903938285e-07, "loss": 0.8043, "step": 470 }, { "epoch": 1.92, "grad_norm": 3.2006499767303467, "learning_rate": 2.384643575837203e-07, "loss": 0.8145, "step": 480 }, { "epoch": 1.96, "grad_norm": 3.356722116470337, "learning_rate": 7.102749348465165e-08, "loss": 0.7569, "step": 490 }, { "epoch": 2.0, "grad_norm": 3.4808757305145264, "learning_rate": 1.973894904597207e-09, "loss": 0.7338, "step": 500 }, { "epoch": 2.0, "step": 500, "total_flos": 3.216214367225774e+17, "train_loss": 0.888002516746521, "train_runtime": 10329.3048, "train_samples_per_second": 0.387, "train_steps_per_second": 0.048 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 1000, "total_flos": 3.216214367225774e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }