{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.4984025559105431, "eval_steps": 39, "global_step": 39, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 0.06435385346412659, "learning_rate": 1e-05, "loss": 0.6574, "step": 1 }, { "epoch": 0.01, "eval_loss": 0.7461566925048828, "eval_runtime": 151.7973, "eval_samples_per_second": 1.482, "eval_steps_per_second": 0.376, "step": 1 }, { "epoch": 0.03, "grad_norm": 0.062134526669979095, "learning_rate": 2e-05, "loss": 0.6897, "step": 2 }, { "epoch": 0.04, "grad_norm": 0.06962593644857407, "learning_rate": 3e-05, "loss": 0.6974, "step": 3 }, { "epoch": 0.05, "grad_norm": 0.0730314701795578, "learning_rate": 4e-05, "loss": 0.7454, "step": 4 }, { "epoch": 0.06, "grad_norm": 0.07659862190485, "learning_rate": 5e-05, "loss": 0.6962, "step": 5 }, { "epoch": 0.08, "grad_norm": 0.07314729690551758, "learning_rate": 6e-05, "loss": 0.6602, "step": 6 }, { "epoch": 0.09, "grad_norm": 0.0892895832657814, "learning_rate": 7e-05, "loss": 0.6594, "step": 7 }, { "epoch": 0.1, "grad_norm": 0.11520648747682571, "learning_rate": 8e-05, "loss": 0.7406, "step": 8 }, { "epoch": 0.12, "grad_norm": 0.1333557516336441, "learning_rate": 9e-05, "loss": 0.663, "step": 9 }, { "epoch": 0.13, "grad_norm": 0.14017663896083832, "learning_rate": 0.0001, "loss": 0.6185, "step": 10 }, { "epoch": 0.14, "grad_norm": 0.1453818678855896, "learning_rate": 0.00011000000000000002, "loss": 0.602, "step": 11 }, { "epoch": 0.15, "grad_norm": 0.14279265701770782, "learning_rate": 0.00012, "loss": 0.6151, "step": 12 }, { "epoch": 0.17, "grad_norm": 0.11934962123632431, "learning_rate": 0.00013000000000000002, "loss": 0.5085, "step": 13 }, { "epoch": 0.18, "grad_norm": 0.11375121772289276, "learning_rate": 0.00014, "loss": 0.4823, "step": 14 }, { "epoch": 0.19, "grad_norm": 0.09707039594650269, "learning_rate": 0.00015000000000000001, "loss": 0.4281, "step": 15 }, { "epoch": 0.2, "grad_norm": 0.11333765089511871, "learning_rate": 0.00016, "loss": 0.4417, "step": 16 }, { "epoch": 0.22, "grad_norm": 0.09926053136587143, "learning_rate": 0.00017, "loss": 0.3654, "step": 17 }, { "epoch": 0.23, "grad_norm": 0.1027929037809372, "learning_rate": 0.00018, "loss": 0.3748, "step": 18 }, { "epoch": 0.24, "grad_norm": 0.09768980741500854, "learning_rate": 0.00019, "loss": 0.3306, "step": 19 }, { "epoch": 0.26, "grad_norm": 0.08072198927402496, "learning_rate": 0.0002, "loss": 0.355, "step": 20 }, { "epoch": 0.27, "grad_norm": 0.06444709748029709, "learning_rate": 0.00019997332081116373, "loss": 0.3152, "step": 21 }, { "epoch": 0.28, "grad_norm": 0.0576409213244915, "learning_rate": 0.00019989329748023725, "loss": 0.2674, "step": 22 }, { "epoch": 0.29, "grad_norm": 0.08624427020549774, "learning_rate": 0.0001997599727063717, "loss": 0.2807, "step": 23 }, { "epoch": 0.31, "grad_norm": 0.08828990906476974, "learning_rate": 0.00019957341762950344, "loss": 0.2736, "step": 24 }, { "epoch": 0.32, "grad_norm": 0.0641990676522255, "learning_rate": 0.00019933373179239502, "loss": 0.2427, "step": 25 }, { "epoch": 0.33, "grad_norm": 0.05487390235066414, "learning_rate": 0.0001990410430875205, "loss": 0.2737, "step": 26 }, { "epoch": 0.35, "grad_norm": 0.0666637197136879, "learning_rate": 0.00019869550768882455, "loss": 0.2806, "step": 27 }, { "epoch": 0.36, "grad_norm": 0.053027719259262085, "learning_rate": 0.0001982973099683902, "loss": 0.2664, "step": 28 }, { "epoch": 0.37, "grad_norm": 0.055217448621988297, 
"learning_rate": 0.0001978466623980609, "loss": 0.309, "step": 29 }, { "epoch": 0.38, "grad_norm": 0.05493360385298729, "learning_rate": 0.0001973438054360693, "loss": 0.2318, "step": 30 }, { "epoch": 0.4, "grad_norm": 0.0376153439283371, "learning_rate": 0.00019678900739873226, "loss": 0.2049, "step": 31 }, { "epoch": 0.41, "grad_norm": 0.03811287507414818, "learning_rate": 0.00019618256431728194, "loss": 0.2771, "step": 32 }, { "epoch": 0.42, "grad_norm": 0.04185184836387634, "learning_rate": 0.000195524799779908, "loss": 0.1843, "step": 33 }, { "epoch": 0.43, "grad_norm": 0.04743755981326103, "learning_rate": 0.0001948160647590966, "loss": 0.2414, "step": 34 }, { "epoch": 0.45, "grad_norm": 0.03343382850289345, "learning_rate": 0.00019405673742435678, "loss": 0.2007, "step": 35 }, { "epoch": 0.46, "grad_norm": 0.03178093209862709, "learning_rate": 0.00019324722294043558, "loss": 0.1898, "step": 36 }, { "epoch": 0.47, "grad_norm": 0.03482227399945259, "learning_rate": 0.0001923879532511287, "loss": 0.2657, "step": 37 }, { "epoch": 0.49, "grad_norm": 0.03727172687649727, "learning_rate": 0.0001914793868488021, "loss": 0.2185, "step": 38 }, { "epoch": 0.5, "grad_norm": 0.0382206104695797, "learning_rate": 0.00019052200852974819, "loss": 0.1778, "step": 39 }, { "epoch": 0.5, "eval_loss": 0.2122737020254135, "eval_runtime": 154.5774, "eval_samples_per_second": 1.456, "eval_steps_per_second": 0.369, "step": 39 } ], "logging_steps": 1, "max_steps": 156, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 39, "total_flos": 5.128431174313574e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }