{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 504, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05952380952380952, "grad_norm": 11.633337718230726, "learning_rate": 2.631578947368421e-07, "loss": 0.8828, "step": 10 }, { "epoch": 0.11904761904761904, "grad_norm": 7.106262599818549, "learning_rate": 5.263157894736842e-07, "loss": 0.8087, "step": 20 }, { "epoch": 0.17857142857142858, "grad_norm": 2.0015322743055655, "learning_rate": 7.894736842105263e-07, "loss": 0.7295, "step": 30 }, { "epoch": 0.23809523809523808, "grad_norm": 1.4753694520182823, "learning_rate": 1.0526315789473683e-06, "loss": 0.6712, "step": 40 }, { "epoch": 0.2976190476190476, "grad_norm": 1.110295332689471, "learning_rate": 1.3157894736842106e-06, "loss": 0.6325, "step": 50 }, { "epoch": 0.35714285714285715, "grad_norm": 2.62423490064751, "learning_rate": 1.5789473684210526e-06, "loss": 0.6019, "step": 60 }, { "epoch": 0.4166666666666667, "grad_norm": 2.320671081832736, "learning_rate": 1.8421052631578946e-06, "loss": 0.5851, "step": 70 }, { "epoch": 0.47619047619047616, "grad_norm": 3.572874828764162, "learning_rate": 1.9996767546702485e-06, "loss": 0.5748, "step": 80 }, { "epoch": 0.5357142857142857, "grad_norm": 2.387950128798658, "learning_rate": 1.996043443883064e-06, "loss": 0.5678, "step": 90 }, { "epoch": 0.5952380952380952, "grad_norm": 2.3954023739686083, "learning_rate": 1.988392397752233e-06, "loss": 0.5575, "step": 100 }, { "epoch": 0.6547619047619048, "grad_norm": 2.9223208257498885, "learning_rate": 1.9767648201496052e-06, "loss": 0.5551, "step": 110 }, { "epoch": 0.7142857142857143, "grad_norm": 2.3303161266900063, "learning_rate": 1.961223330122206e-06, "loss": 0.5468, "step": 120 }, { "epoch": 0.7738095238095238, "grad_norm": 2.5697587476278088, "learning_rate": 1.941851624664209e-06, "loss": 0.5452, "step": 130 }, { "epoch": 0.8333333333333334, "grad_norm": 2.7124137926277583, "learning_rate": 1.9187540279759314e-06, "loss": 0.5381, "step": 140 }, { "epoch": 0.8928571428571429, "grad_norm": 2.6892276446954377, "learning_rate": 1.8920549296372686e-06, "loss": 0.5341, "step": 150 }, { "epoch": 0.9523809523809523, "grad_norm": 2.078964942505403, "learning_rate": 1.861898114721218e-06, "loss": 0.5294, "step": 160 }, { "epoch": 1.0, "eval_loss": 0.06584873795509338, "eval_runtime": 116.4553, "eval_samples_per_second": 155.45, "eval_steps_per_second": 0.61, "step": 168 }, { "epoch": 1.0119047619047619, "grad_norm": 1.8862054510010102, "learning_rate": 1.8284459894551025e-06, "loss": 0.5242, "step": 170 }, { "epoch": 1.0714285714285714, "grad_norm": 2.1399543444499467, "learning_rate": 1.7918787065996015e-06, "loss": 0.506, "step": 180 }, { "epoch": 1.130952380952381, "grad_norm": 1.9062388412547935, "learning_rate": 1.7523931952557666e-06, "loss": 0.5037, "step": 190 }, { "epoch": 1.1904761904761905, "grad_norm": 1.2708808678811239, "learning_rate": 1.7102021003248955e-06, "loss": 0.5017, "step": 200 }, { "epoch": 1.25, "grad_norm": 1.3542515524166345, "learning_rate": 1.6655326373326793e-06, "loss": 0.5023, "step": 210 }, { "epoch": 1.3095238095238095, "grad_norm": 2.1610871241580822, "learning_rate": 1.6186253687848507e-06, "loss": 0.497, "step": 220 }, { "epoch": 1.369047619047619, "grad_norm": 2.0822374960801397, "learning_rate": 1.569732908644127e-06, "loss": 0.4962, "step": 230 }, { "epoch": 1.4285714285714286, "grad_norm": 2.0776279888799416, "learning_rate": 
1.5191185619053519e-06, "loss": 0.4943, "step": 240 }, { "epoch": 1.4880952380952381, "grad_norm": 1.991564712073659, "learning_rate": 1.4670549065952552e-06, "loss": 0.4903, "step": 250 }, { "epoch": 1.5476190476190477, "grad_norm": 2.037009128388502, "learning_rate": 1.4138223258333096e-06, "loss": 0.4885, "step": 260 }, { "epoch": 1.6071428571428572, "grad_norm": 1.820207258608856, "learning_rate": 1.3597074978591206e-06, "loss": 0.4864, "step": 270 }, { "epoch": 1.6666666666666665, "grad_norm": 2.1241388881877183, "learning_rate": 1.3050018521581279e-06, "loss": 0.4871, "step": 280 }, { "epoch": 1.7261904761904763, "grad_norm": 1.78922859395332, "learning_rate": 1.2499999999999999e-06, "loss": 0.4864, "step": 290 }, { "epoch": 1.7857142857142856, "grad_norm": 1.7328904839729484, "learning_rate": 1.1949981478418721e-06, "loss": 0.4813, "step": 300 }, { "epoch": 1.8452380952380953, "grad_norm": 1.935272666911019, "learning_rate": 1.1402925021408796e-06, "loss": 0.4801, "step": 310 }, { "epoch": 1.9047619047619047, "grad_norm": 1.2142950270281057, "learning_rate": 1.0861776741666901e-06, "loss": 0.4795, "step": 320 }, { "epoch": 1.9642857142857144, "grad_norm": 1.3120184387970604, "learning_rate": 1.032945093404745e-06, "loss": 0.4791, "step": 330 }, { "epoch": 2.0, "eval_loss": 0.061718959361314774, "eval_runtime": 116.2674, "eval_samples_per_second": 155.701, "eval_steps_per_second": 0.611, "step": 336 }, { "epoch": 2.0238095238095237, "grad_norm": 1.4955203361832747, "learning_rate": 9.80881438094648e-07, "loss": 0.4693, "step": 340 }, { "epoch": 2.0833333333333335, "grad_norm": 1.1141424267445637, "learning_rate": 9.302670913558731e-07, "loss": 0.4599, "step": 350 }, { "epoch": 2.142857142857143, "grad_norm": 0.8809215575567357, "learning_rate": 8.813746312151494e-07, "loss": 0.4552, "step": 360 }, { "epoch": 2.2023809523809526, "grad_norm": 0.819257933682719, "learning_rate": 8.344673626673205e-07, "loss": 0.4546, "step": 370 }, { "epoch": 2.261904761904762, "grad_norm": 0.9199338927704399, "learning_rate": 7.897978996751046e-07, "loss": 0.4574, "step": 380 }, { "epoch": 2.3214285714285716, "grad_norm": 0.9400263078511765, "learning_rate": 7.476068047442332e-07, "loss": 0.4527, "step": 390 }, { "epoch": 2.380952380952381, "grad_norm": 0.8120316882785873, "learning_rate": 7.081212934003984e-07, "loss": 0.4517, "step": 400 }, { "epoch": 2.4404761904761907, "grad_norm": 0.926431884810278, "learning_rate": 6.715540105448972e-07, "loss": 0.4507, "step": 410 }, { "epoch": 2.5, "grad_norm": 0.8615015270894472, "learning_rate": 6.381018852787821e-07, "loss": 0.4505, "step": 420 }, { "epoch": 2.5595238095238093, "grad_norm": 0.794846867305112, "learning_rate": 6.079450703627314e-07, "loss": 0.4519, "step": 430 }, { "epoch": 2.619047619047619, "grad_norm": 0.8659294487925853, "learning_rate": 5.812459720240681e-07, "loss": 0.4523, "step": 440 }, { "epoch": 2.678571428571429, "grad_norm": 0.8078214310845254, "learning_rate": 5.581483753357905e-07, "loss": 0.4498, "step": 450 }, { "epoch": 2.738095238095238, "grad_norm": 0.9616399301296082, "learning_rate": 5.387766698777935e-07, "loss": 0.451, "step": 460 }, { "epoch": 2.7976190476190474, "grad_norm": 0.8093607714101151, "learning_rate": 5.232351798503945e-07, "loss": 0.4495, "step": 470 }, { "epoch": 2.857142857142857, "grad_norm": 0.8508798798797214, "learning_rate": 5.116076022477671e-07, "loss": 0.4487, "step": 480 }, { "epoch": 2.9166666666666665, "grad_norm": 0.738511448943807, "learning_rate": 5.039565561169362e-07, "loss": 0.4454, 
"step": 490 }, { "epoch": 2.9761904761904763, "grad_norm": 0.7290876762277495, "learning_rate": 5.003232453297512e-07, "loss": 0.4473, "step": 500 }, { "epoch": 3.0, "eval_loss": 0.06018054857850075, "eval_runtime": 118.2486, "eval_samples_per_second": 153.093, "eval_steps_per_second": 0.6, "step": 504 }, { "epoch": 3.0, "step": 504, "total_flos": 3376037568184320.0, "train_loss": 0.518157976960379, "train_runtime": 17131.2553, "train_samples_per_second": 60.233, "train_steps_per_second": 0.029 } ], "logging_steps": 10, "max_steps": 504, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3376037568184320.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }