{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9948186528497409, "eval_steps": 12, "global_step": 48, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02072538860103627, "grad_norm": 0.012328565120697021, "learning_rate": 4.000000000000001e-06, "loss": 12.4632, "step": 1 }, { "epoch": 0.02072538860103627, "eval_loss": 12.459209442138672, "eval_runtime": 1.4447, "eval_samples_per_second": 56.758, "eval_steps_per_second": 14.536, "step": 1 }, { "epoch": 0.04145077720207254, "grad_norm": 0.01481240801513195, "learning_rate": 8.000000000000001e-06, "loss": 12.4598, "step": 2 }, { "epoch": 0.06217616580310881, "grad_norm": 0.01405641995370388, "learning_rate": 1.2e-05, "loss": 12.461, "step": 3 }, { "epoch": 0.08290155440414508, "grad_norm": 0.012295316904783249, "learning_rate": 1.6000000000000003e-05, "loss": 12.4607, "step": 4 }, { "epoch": 0.10362694300518134, "grad_norm": 0.01401484664529562, "learning_rate": 2e-05, "loss": 12.4673, "step": 5 }, { "epoch": 0.12435233160621761, "grad_norm": 0.015558955259621143, "learning_rate": 2.4e-05, "loss": 12.4628, "step": 6 }, { "epoch": 0.14507772020725387, "grad_norm": 0.013355869799852371, "learning_rate": 2.8000000000000003e-05, "loss": 12.4672, "step": 7 }, { "epoch": 0.16580310880829016, "grad_norm": 0.015484556555747986, "learning_rate": 3.2000000000000005e-05, "loss": 12.4612, "step": 8 }, { "epoch": 0.18652849740932642, "grad_norm": 0.01674036495387554, "learning_rate": 3.6e-05, "loss": 12.4657, "step": 9 }, { "epoch": 0.20725388601036268, "grad_norm": 0.016482409089803696, "learning_rate": 4e-05, "loss": 12.4615, "step": 10 }, { "epoch": 0.22797927461139897, "grad_norm": 0.01787281408905983, "learning_rate": 4.4000000000000006e-05, "loss": 12.4637, "step": 11 }, { "epoch": 0.24870466321243523, "grad_norm": 0.01791117526590824, "learning_rate": 4.8e-05, "loss": 12.4663, "step": 12 }, { "epoch": 0.24870466321243523, "eval_loss": 12.459169387817383, "eval_runtime": 1.4466, "eval_samples_per_second": 56.684, "eval_steps_per_second": 14.517, "step": 12 }, { "epoch": 0.2694300518134715, "grad_norm": 0.012630539946258068, "learning_rate": 5.2000000000000004e-05, "loss": 12.4626, "step": 13 }, { "epoch": 0.29015544041450775, "grad_norm": 0.012216604314744473, "learning_rate": 5.6000000000000006e-05, "loss": 12.4612, "step": 14 }, { "epoch": 0.31088082901554404, "grad_norm": 0.01380347739905119, "learning_rate": 6e-05, "loss": 12.4622, "step": 15 }, { "epoch": 0.3316062176165803, "grad_norm": 0.014256793074309826, "learning_rate": 6.400000000000001e-05, "loss": 12.4618, "step": 16 }, { "epoch": 0.35233160621761656, "grad_norm": 0.015153720043599606, "learning_rate": 6.800000000000001e-05, "loss": 12.4671, "step": 17 }, { "epoch": 0.37305699481865284, "grad_norm": 0.013849603943526745, "learning_rate": 7.2e-05, "loss": 12.4667, "step": 18 }, { "epoch": 0.39378238341968913, "grad_norm": 0.013394423760473728, "learning_rate": 7.6e-05, "loss": 12.4601, "step": 19 }, { "epoch": 0.41450777202072536, "grad_norm": 0.015159090980887413, "learning_rate": 8e-05, "loss": 12.4657, "step": 20 }, { "epoch": 0.43523316062176165, "grad_norm": 0.015923170372843742, "learning_rate": 8.4e-05, "loss": 12.4655, "step": 21 }, { "epoch": 0.45595854922279794, "grad_norm": 0.01685705967247486, "learning_rate": 8.800000000000001e-05, "loss": 12.4621, "step": 22 }, { "epoch": 0.47668393782383417, "grad_norm": 0.015810972079634666, "learning_rate": 9.200000000000001e-05, "loss": 12.465, "step": 23 }, { "epoch": 0.49740932642487046, "grad_norm": 0.018734809011220932, "learning_rate": 9.6e-05, "loss": 12.4651, "step": 24 }, { "epoch": 0.49740932642487046, "eval_loss": 12.45895767211914, "eval_runtime": 1.4297, "eval_samples_per_second": 57.355, "eval_steps_per_second": 14.688, "step": 24 }, { "epoch": 0.5181347150259067, "grad_norm": 0.01412452757358551, "learning_rate": 0.0001, "loss": 12.4627, "step": 25 }, { "epoch": 0.538860103626943, "grad_norm": 0.01330703403800726, "learning_rate": 0.00010400000000000001, "loss": 12.4633, "step": 26 }, { "epoch": 0.5595854922279793, "grad_norm": 0.01628568395972252, "learning_rate": 0.00010800000000000001, "loss": 12.4593, "step": 27 }, { "epoch": 0.5803108808290155, "grad_norm": 0.013267126865684986, "learning_rate": 0.00011200000000000001, "loss": 12.4597, "step": 28 }, { "epoch": 0.6010362694300518, "grad_norm": 0.01633607968688011, "learning_rate": 0.000116, "loss": 12.4574, "step": 29 }, { "epoch": 0.6217616580310881, "grad_norm": 0.014100078493356705, "learning_rate": 0.00012, "loss": 12.4624, "step": 30 }, { "epoch": 0.6424870466321243, "grad_norm": 0.0167540293186903, "learning_rate": 0.000124, "loss": 12.4602, "step": 31 }, { "epoch": 0.6632124352331606, "grad_norm": 0.015121635980904102, "learning_rate": 0.00012800000000000002, "loss": 12.4627, "step": 32 }, { "epoch": 0.6839378238341969, "grad_norm": 0.015222931280732155, "learning_rate": 0.000132, "loss": 12.4621, "step": 33 }, { "epoch": 0.7046632124352331, "grad_norm": 0.01850767433643341, "learning_rate": 0.00013600000000000003, "loss": 12.4652, "step": 34 }, { "epoch": 0.7253886010362695, "grad_norm": 0.021301083266735077, "learning_rate": 0.00014, "loss": 12.4616, "step": 35 }, { "epoch": 0.7461139896373057, "grad_norm": 0.02175988256931305, "learning_rate": 0.000144, "loss": 12.4653, "step": 36 }, { "epoch": 0.7461139896373057, "eval_loss": 12.458399772644043, "eval_runtime": 1.4242, "eval_samples_per_second": 57.575, "eval_steps_per_second": 14.745, "step": 36 }, { "epoch": 0.7668393782383419, "grad_norm": 0.016086047515273094, "learning_rate": 0.000148, "loss": 12.4601, "step": 37 }, { "epoch": 0.7875647668393783, "grad_norm": 0.016342192888259888, "learning_rate": 0.000152, "loss": 12.46, "step": 38 }, { "epoch": 0.8082901554404145, "grad_norm": 0.018003815785050392, "learning_rate": 0.00015600000000000002, "loss": 12.4616, "step": 39 }, { "epoch": 0.8290155440414507, "grad_norm": 0.020604344084858894, "learning_rate": 0.00016, "loss": 12.4571, "step": 40 }, { "epoch": 0.8497409326424871, "grad_norm": 0.01911165565252304, "learning_rate": 0.000164, "loss": 12.4576, "step": 41 }, { "epoch": 0.8704663212435233, "grad_norm": 0.022435059770941734, "learning_rate": 0.000168, "loss": 12.4594, "step": 42 }, { "epoch": 0.8911917098445595, "grad_norm": 0.0239774901419878, "learning_rate": 0.000172, "loss": 12.4606, "step": 43 }, { "epoch": 0.9119170984455959, "grad_norm": 0.02675757370889187, "learning_rate": 0.00017600000000000002, "loss": 12.4617, "step": 44 }, { "epoch": 0.9326424870466321, "grad_norm": 0.028698239475488663, "learning_rate": 0.00018, "loss": 12.4596, "step": 45 }, { "epoch": 0.9533678756476683, "grad_norm": 0.02775721438229084, "learning_rate": 0.00018400000000000003, "loss": 12.4607, "step": 46 }, { "epoch": 0.9740932642487047, "grad_norm": 0.03137590363621712, "learning_rate": 0.000188, "loss": 12.4636, "step": 47 }, { "epoch": 0.9948186528497409, "grad_norm": 0.035989392548799515, "learning_rate": 0.000192, "loss": 12.4666, "step": 48 }, { "epoch": 0.9948186528497409, "eval_loss": 12.45715045928955, "eval_runtime": 1.4531, "eval_samples_per_second": 56.429, "eval_steps_per_second": 14.451, "step": 48 } ], "logging_steps": 1, "max_steps": 48, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 12, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 236760072192.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }