{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 3, "global_step": 40, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.1, "grad_norm": 0.14388315379619598, "learning_rate": 1.0000000000000001e-07, "loss": 1.8335, "step": 1 }, { "epoch": 0.1, "eval_loss": 2.0722551345825195, "eval_runtime": 16.7825, "eval_samples_per_second": 1.013, "eval_steps_per_second": 0.179, "step": 1 }, { "epoch": 0.2, "grad_norm": 0.14784696698188782, "learning_rate": 2.0000000000000002e-07, "loss": 1.9186, "step": 2 }, { "epoch": 0.3, "grad_norm": 0.14353208243846893, "learning_rate": 3.0000000000000004e-07, "loss": 1.7855, "step": 3 }, { "epoch": 0.3, "eval_loss": 2.0736846923828125, "eval_runtime": 16.8339, "eval_samples_per_second": 1.01, "eval_steps_per_second": 0.178, "step": 3 }, { "epoch": 0.4, "grad_norm": 0.1807236522436142, "learning_rate": 4.0000000000000003e-07, "loss": 1.8872, "step": 4 }, { "epoch": 0.5, "grad_norm": 0.16707605123519897, "learning_rate": 5.000000000000001e-07, "loss": 1.7567, "step": 5 }, { "epoch": 0.6, "grad_norm": 0.14522354304790497, "learning_rate": 6.000000000000001e-07, "loss": 1.8449, "step": 6 }, { "epoch": 0.6, "eval_loss": 2.0734410285949707, "eval_runtime": 16.8439, "eval_samples_per_second": 1.009, "eval_steps_per_second": 0.178, "step": 6 }, { "epoch": 0.7, "grad_norm": 0.202021062374115, "learning_rate": 7.000000000000001e-07, "loss": 1.9643, "step": 7 }, { "epoch": 0.8, "grad_norm": 0.19137771427631378, "learning_rate": 8.000000000000001e-07, "loss": 1.8088, "step": 8 }, { "epoch": 0.9, "grad_norm": 0.16487151384353638, "learning_rate": 9.000000000000001e-07, "loss": 1.99, "step": 9 }, { "epoch": 0.9, "eval_loss": 2.0727851390838623, "eval_runtime": 16.8683, "eval_samples_per_second": 1.008, "eval_steps_per_second": 0.178, "step": 9 }, { "epoch": 1.0, "grad_norm": 0.15699583292007446, "learning_rate": 1.0000000000000002e-06, "loss": 1.7899, "step": 10 }, { "epoch": 1.1, "grad_norm": 0.19337183237075806, "learning_rate": 1.1e-06, "loss": 2.0322, "step": 11 }, { "epoch": 1.2, "grad_norm": 0.1443641483783722, "learning_rate": 1.2000000000000002e-06, "loss": 1.7771, "step": 12 }, { "epoch": 1.2, "eval_loss": 2.0730879306793213, "eval_runtime": 16.8475, "eval_samples_per_second": 1.009, "eval_steps_per_second": 0.178, "step": 12 }, { "epoch": 1.3, "grad_norm": 0.1785973757505417, "learning_rate": 1.3e-06, "loss": 1.8003, "step": 13 }, { "epoch": 1.4, "grad_norm": 0.12677137553691864, "learning_rate": 1.4000000000000001e-06, "loss": 1.875, "step": 14 }, { "epoch": 1.5, "grad_norm": 0.22960005700588226, "learning_rate": 1.5e-06, "loss": 1.8706, "step": 15 }, { "epoch": 1.5, "eval_loss": 2.073540210723877, "eval_runtime": 16.9094, "eval_samples_per_second": 1.005, "eval_steps_per_second": 0.177, "step": 15 }, { "epoch": 1.6, "grad_norm": 1.5457377433776855, "learning_rate": 1.6000000000000001e-06, "loss": 1.7857, "step": 16 }, { "epoch": 1.7, "grad_norm": 0.1759018898010254, "learning_rate": 1.7000000000000002e-06, "loss": 1.7879, "step": 17 }, { "epoch": 1.8, "grad_norm": 0.1202540397644043, "learning_rate": 1.8000000000000001e-06, "loss": 1.8152, "step": 18 }, { "epoch": 1.8, "eval_loss": 2.072761297225952, "eval_runtime": 16.852, "eval_samples_per_second": 1.009, "eval_steps_per_second": 0.178, "step": 18 }, { "epoch": 1.9, "grad_norm": 0.1538366675376892, "learning_rate": 1.9000000000000002e-06, "loss": 1.8449, "step": 19 }, { "epoch": 2.0, "grad_norm": 
0.1657520830631256, "learning_rate": 2.0000000000000003e-06, "loss": 1.9541, "step": 20 }, { "epoch": 2.1, "grad_norm": 0.14708968997001648, "learning_rate": 2.1000000000000002e-06, "loss": 1.7398, "step": 21 }, { "epoch": 2.1, "eval_loss": 2.071575164794922, "eval_runtime": 16.8841, "eval_samples_per_second": 1.007, "eval_steps_per_second": 0.178, "step": 21 }, { "epoch": 2.2, "grad_norm": 0.1246899738907814, "learning_rate": 2.2e-06, "loss": 1.7001, "step": 22 }, { "epoch": 2.3, "grad_norm": 0.2179771065711975, "learning_rate": 2.3000000000000004e-06, "loss": 2.008, "step": 23 }, { "epoch": 2.4, "grad_norm": 0.17468346655368805, "learning_rate": 2.4000000000000003e-06, "loss": 1.7819, "step": 24 }, { "epoch": 2.4, "eval_loss": 2.071531057357788, "eval_runtime": 16.9083, "eval_samples_per_second": 1.005, "eval_steps_per_second": 0.177, "step": 24 }, { "epoch": 2.5, "grad_norm": 0.16188733279705048, "learning_rate": 2.5e-06, "loss": 1.8818, "step": 25 }, { "epoch": 2.6, "grad_norm": 0.23305261135101318, "learning_rate": 2.6e-06, "loss": 1.9217, "step": 26 }, { "epoch": 2.7, "grad_norm": 0.15702185034751892, "learning_rate": 2.7000000000000004e-06, "loss": 1.7759, "step": 27 }, { "epoch": 2.7, "eval_loss": 2.070585250854492, "eval_runtime": 16.9449, "eval_samples_per_second": 1.003, "eval_steps_per_second": 0.177, "step": 27 }, { "epoch": 2.8, "grad_norm": 0.196556955575943, "learning_rate": 2.8000000000000003e-06, "loss": 1.8756, "step": 28 }, { "epoch": 2.9, "grad_norm": 0.20446979999542236, "learning_rate": 2.9e-06, "loss": 1.9348, "step": 29 }, { "epoch": 3.0, "grad_norm": 0.15580403804779053, "learning_rate": 3e-06, "loss": 1.8823, "step": 30 }, { "epoch": 3.0, "eval_loss": 2.068406105041504, "eval_runtime": 16.8798, "eval_samples_per_second": 1.007, "eval_steps_per_second": 0.178, "step": 30 }, { "epoch": 3.1, "grad_norm": 0.2582396864891052, "learning_rate": 3.1000000000000004e-06, "loss": 1.9559, "step": 31 }, { "epoch": 3.2, "grad_norm": 0.28845566511154175, "learning_rate": 3.2000000000000003e-06, "loss": 1.8789, "step": 32 }, { "epoch": 3.3, "grad_norm": 0.20270375907421112, "learning_rate": 3.3000000000000006e-06, "loss": 1.7957, "step": 33 }, { "epoch": 3.3, "eval_loss": 2.064920663833618, "eval_runtime": 16.8875, "eval_samples_per_second": 1.007, "eval_steps_per_second": 0.178, "step": 33 }, { "epoch": 3.4, "grad_norm": 0.17135809361934662, "learning_rate": 3.4000000000000005e-06, "loss": 1.8674, "step": 34 }, { "epoch": 3.5, "grad_norm": 0.19458340108394623, "learning_rate": 3.5e-06, "loss": 1.7389, "step": 35 }, { "epoch": 3.6, "grad_norm": 0.14971010386943817, "learning_rate": 3.6000000000000003e-06, "loss": 1.8849, "step": 36 }, { "epoch": 3.6, "eval_loss": 2.0623722076416016, "eval_runtime": 16.8542, "eval_samples_per_second": 1.009, "eval_steps_per_second": 0.178, "step": 36 }, { "epoch": 3.7, "grad_norm": 0.1842203140258789, "learning_rate": 3.7e-06, "loss": 1.8708, "step": 37 }, { "epoch": 3.8, "grad_norm": 0.154820516705513, "learning_rate": 3.8000000000000005e-06, "loss": 1.8099, "step": 38 }, { "epoch": 3.9, "grad_norm": 0.22315488755702972, "learning_rate": 3.900000000000001e-06, "loss": 1.8567, "step": 39 }, { "epoch": 3.9, "eval_loss": 2.0567445755004883, "eval_runtime": 16.8981, "eval_samples_per_second": 1.006, "eval_steps_per_second": 0.178, "step": 39 }, { "epoch": 4.0, "grad_norm": 0.2417844533920288, "learning_rate": 4.000000000000001e-06, "loss": 1.9341, "step": 40 } ], "logging_steps": 1, "max_steps": 40, "num_input_tokens_seen": 0, "num_train_epochs": 4, 
"save_steps": 10, "total_flos": 4.3988113588800717e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }