{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9101251422070534, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "grad_norm": 0.26715654134750366, "learning_rate": 4.9992855064046754e-05, "loss": 2.6697, "step": 5 }, { "epoch": 0.05, "grad_norm": 0.4067687392234802, "learning_rate": 4.997142434019578e-05, "loss": 2.5369, "step": 10 }, { "epoch": 0.07, "grad_norm": 0.44472697377204895, "learning_rate": 4.9935720078139045e-05, "loss": 2.5661, "step": 15 }, { "epoch": 0.09, "grad_norm": 0.45230674743652344, "learning_rate": 4.988576268624979e-05, "loss": 2.4824, "step": 20 }, { "epoch": 0.11, "grad_norm": 0.4477585554122925, "learning_rate": 4.982158071991725e-05, "loss": 2.3343, "step": 25 }, { "epoch": 0.14, "grad_norm": 0.3630908131599426, "learning_rate": 4.974321086522453e-05, "loss": 2.4377, "step": 30 }, { "epoch": 0.16, "grad_norm": 0.3237389028072357, "learning_rate": 4.9650697917979025e-05, "loss": 2.4114, "step": 35 }, { "epoch": 0.18, "grad_norm": 0.3014233708381653, "learning_rate": 4.954409475810737e-05, "loss": 2.2636, "step": 40 }, { "epoch": 0.2, "grad_norm": 0.3054388761520386, "learning_rate": 4.942346231942955e-05, "loss": 2.2758, "step": 45 }, { "epoch": 0.23, "grad_norm": 0.2940792143344879, "learning_rate": 4.92888695548294e-05, "loss": 2.3207, "step": 50 }, { "epoch": 0.25, "grad_norm": 0.25064757466316223, "learning_rate": 4.9140393396841565e-05, "loss": 2.2209, "step": 55 }, { "epoch": 0.27, "grad_norm": 0.3227023780345917, "learning_rate": 4.89781187136772e-05, "loss": 2.2328, "step": 60 }, { "epoch": 0.3, "grad_norm": 0.312673419713974, "learning_rate": 4.880213826071375e-05, "loss": 2.2737, "step": 65 }, { "epoch": 0.32, "grad_norm": 0.26930904388427734, "learning_rate": 4.861255262747643e-05, "loss": 2.2686, "step": 70 }, { "epoch": 0.34, "grad_norm": 0.2343609631061554, "learning_rate": 4.8409470180141827e-05, "loss": 2.2661, "step": 75 }, { "epoch": 0.36, "grad_norm": 0.3088403642177582, "learning_rate": 4.8193006999596294e-05, "loss": 2.191, "step": 80 }, { "epoch": 0.39, "grad_norm": 0.3336387574672699, "learning_rate": 4.796328681508473e-05, "loss": 2.2106, "step": 85 }, { "epoch": 0.41, "grad_norm": 0.28899675607681274, "learning_rate": 4.7720440933487575e-05, "loss": 2.2347, "step": 90 }, { "epoch": 0.43, "grad_norm": 0.30223432183265686, "learning_rate": 4.746460816426647e-05, "loss": 2.2307, "step": 95 }, { "epoch": 0.46, "grad_norm": 0.2692021429538727, "learning_rate": 4.7195934740121485e-05, "loss": 2.1503, "step": 100 }, { "epoch": 0.48, "grad_norm": 0.3296695053577423, "learning_rate": 4.6914574233405236e-05, "loss": 2.2145, "step": 105 }, { "epoch": 0.5, "grad_norm": 0.28714171051979065, "learning_rate": 4.662068746834176e-05, "loss": 2.1163, "step": 110 }, { "epoch": 0.52, "grad_norm": 0.35001716017723083, "learning_rate": 4.6314442429100155e-05, "loss": 2.1868, "step": 115 }, { "epoch": 0.55, "grad_norm": 0.32721835374832153, "learning_rate": 4.599601416377575e-05, "loss": 2.1865, "step": 120 }, { "epoch": 0.57, "grad_norm": 0.30708980560302734, "learning_rate": 4.566558468433344e-05, "loss": 2.2035, "step": 125 }, { "epoch": 0.59, "grad_norm": 0.3122975528240204, "learning_rate": 4.532334286257064e-05, "loss": 2.1762, "step": 130 }, { "epoch": 0.61, "grad_norm": 0.3531278669834137, "learning_rate": 4.496948432215913e-05, "loss": 2.2452, "step": 135 }, { "epoch": 0.64, "grad_norm": 0.3190854489803314, 
"learning_rate": 4.460421132682751e-05, "loss": 2.2267, "step": 140 }, { "epoch": 0.66, "grad_norm": 0.29605475068092346, "learning_rate": 4.4227732664748365e-05, "loss": 2.2548, "step": 145 }, { "epoch": 0.68, "grad_norm": 0.37863361835479736, "learning_rate": 4.384026352919595e-05, "loss": 2.2053, "step": 150 }, { "epoch": 0.71, "grad_norm": 0.3235403895378113, "learning_rate": 4.344202539554285e-05, "loss": 2.193, "step": 155 }, { "epoch": 0.73, "grad_norm": 0.3326057195663452, "learning_rate": 4.3033245894665814e-05, "loss": 2.2349, "step": 160 }, { "epoch": 0.75, "grad_norm": 0.3827252686023712, "learning_rate": 4.261415868283304e-05, "loss": 2.1247, "step": 165 }, { "epoch": 0.77, "grad_norm": 0.34777992963790894, "learning_rate": 4.218500330814753e-05, "loss": 2.1555, "step": 170 }, { "epoch": 0.8, "grad_norm": 0.3294101059436798, "learning_rate": 4.174602507362258e-05, "loss": 2.1771, "step": 175 }, { "epoch": 0.82, "grad_norm": 0.3454132676124573, "learning_rate": 4.1297474896967814e-05, "loss": 2.1616, "step": 180 }, { "epoch": 0.84, "grad_norm": 0.3628969192504883, "learning_rate": 4.083960916716597e-05, "loss": 2.1681, "step": 185 }, { "epoch": 0.86, "grad_norm": 0.31756460666656494, "learning_rate": 4.0372689597922215e-05, "loss": 2.146, "step": 190 }, { "epoch": 0.89, "grad_norm": 0.41087692975997925, "learning_rate": 3.989698307806995e-05, "loss": 2.2185, "step": 195 }, { "epoch": 0.91, "grad_norm": 0.3311336636543274, "learning_rate": 3.941276151901853e-05, "loss": 2.0976, "step": 200 } ], "logging_steps": 5, "max_steps": 657, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "total_flos": 6.836880289234944e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }