{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.992, "eval_steps": 16, "global_step": 62, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.016, "grad_norm": 1.4910566806793213, "learning_rate": 1e-05, "loss": 88.7097, "step": 1 }, { "epoch": 0.016, "eval_loss": 11.088533401489258, "eval_runtime": 1.124, "eval_samples_per_second": 94.304, "eval_steps_per_second": 24.021, "step": 1 }, { "epoch": 0.032, "grad_norm": 1.4108612537384033, "learning_rate": 2e-05, "loss": 88.7293, "step": 2 }, { "epoch": 0.048, "grad_norm": 1.486616849899292, "learning_rate": 3e-05, "loss": 88.7434, "step": 3 }, { "epoch": 0.064, "grad_norm": 1.7622555494308472, "learning_rate": 4e-05, "loss": 88.7179, "step": 4 }, { "epoch": 0.08, "grad_norm": 1.5639805793762207, "learning_rate": 5e-05, "loss": 88.6803, "step": 5 }, { "epoch": 0.096, "grad_norm": 1.7514230012893677, "learning_rate": 6e-05, "loss": 88.7313, "step": 6 }, { "epoch": 0.112, "grad_norm": 1.9987614154815674, "learning_rate": 7e-05, "loss": 88.7164, "step": 7 }, { "epoch": 0.128, "grad_norm": 1.940069317817688, "learning_rate": 8e-05, "loss": 88.7398, "step": 8 }, { "epoch": 0.144, "grad_norm": 1.8506841659545898, "learning_rate": 9e-05, "loss": 88.6965, "step": 9 }, { "epoch": 0.16, "grad_norm": 1.9289461374282837, "learning_rate": 0.0001, "loss": 88.8021, "step": 10 }, { "epoch": 0.176, "grad_norm": 1.9998689889907837, "learning_rate": 0.00011000000000000002, "loss": 88.6776, "step": 11 }, { "epoch": 0.192, "grad_norm": 2.1637938022613525, "learning_rate": 0.00012, "loss": 88.7963, "step": 12 }, { "epoch": 0.208, "grad_norm": 2.258723497390747, "learning_rate": 0.00013000000000000002, "loss": 88.7175, "step": 13 }, { "epoch": 0.224, "grad_norm": 2.3891043663024902, "learning_rate": 0.00014, "loss": 88.6413, "step": 14 }, { "epoch": 0.24, "grad_norm": 2.612598180770874, "learning_rate": 0.00015000000000000001, "loss": 88.7464, "step": 15 }, { "epoch": 0.256, "grad_norm": 2.633556365966797, "learning_rate": 0.00016, "loss": 88.7045, "step": 16 }, { "epoch": 0.256, "eval_loss": 11.084848403930664, "eval_runtime": 0.8206, "eval_samples_per_second": 129.18, "eval_steps_per_second": 32.904, "step": 16 }, { "epoch": 0.272, "grad_norm": 1.444272756576538, "learning_rate": 0.00017, "loss": 88.719, "step": 17 }, { "epoch": 0.288, "grad_norm": 1.4536869525909424, "learning_rate": 0.00018, "loss": 88.7756, "step": 18 }, { "epoch": 0.304, "grad_norm": 1.6990916728973389, "learning_rate": 0.00019, "loss": 88.719, "step": 19 }, { "epoch": 0.32, "grad_norm": 1.6907556056976318, "learning_rate": 0.0002, "loss": 88.6939, "step": 20 }, { "epoch": 0.336, "grad_norm": 1.7854983806610107, "learning_rate": 0.00019972037971811802, "loss": 88.6928, "step": 21 }, { "epoch": 0.352, "grad_norm": 1.7495782375335693, "learning_rate": 0.00019888308262251285, "loss": 88.7139, "step": 22 }, { "epoch": 0.368, "grad_norm": 1.8135170936584473, "learning_rate": 0.00019749279121818235, "loss": 88.6808, "step": 23 }, { "epoch": 0.384, "grad_norm": 1.9571936130523682, "learning_rate": 0.0001955572805786141, "loss": 88.7278, "step": 24 }, { "epoch": 0.4, "grad_norm": 1.9375219345092773, "learning_rate": 0.00019308737486442045, "loss": 88.661, "step": 25 }, { "epoch": 0.416, "grad_norm": 2.0492427349090576, "learning_rate": 0.0001900968867902419, "loss": 88.6735, "step": 26 }, { "epoch": 0.432, "grad_norm": 2.057321786880493, "learning_rate": 0.00018660254037844388, "loss": 88.5661, "step": 27 }, { 
"epoch": 0.448, "grad_norm": 2.227236747741699, "learning_rate": 0.0001826238774315995, "loss": 88.6781, "step": 28 }, { "epoch": 0.464, "grad_norm": 2.3857967853546143, "learning_rate": 0.000178183148246803, "loss": 88.6794, "step": 29 }, { "epoch": 0.48, "grad_norm": 2.7329936027526855, "learning_rate": 0.00017330518718298264, "loss": 88.6323, "step": 30 }, { "epoch": 0.496, "grad_norm": 3.2216603755950928, "learning_rate": 0.00016801727377709194, "loss": 88.5737, "step": 31 }, { "epoch": 0.512, "grad_norm": 1.5606772899627686, "learning_rate": 0.00016234898018587337, "loss": 88.6774, "step": 32 }, { "epoch": 0.512, "eval_loss": 11.075674057006836, "eval_runtime": 0.8047, "eval_samples_per_second": 131.72, "eval_steps_per_second": 33.551, "step": 32 }, { "epoch": 0.528, "grad_norm": 1.6445367336273193, "learning_rate": 0.0001563320058063622, "loss": 88.6368, "step": 33 }, { "epoch": 0.544, "grad_norm": 1.6764920949935913, "learning_rate": 0.00015000000000000001, "loss": 88.6352, "step": 34 }, { "epoch": 0.56, "grad_norm": 1.6223012208938599, "learning_rate": 0.00014338837391175582, "loss": 88.6562, "step": 35 }, { "epoch": 0.576, "grad_norm": 1.6213066577911377, "learning_rate": 0.00013653410243663952, "loss": 88.6551, "step": 36 }, { "epoch": 0.592, "grad_norm": 1.6623308658599854, "learning_rate": 0.00012947551744109043, "loss": 88.6895, "step": 37 }, { "epoch": 0.608, "grad_norm": 1.8002474308013916, "learning_rate": 0.00012225209339563145, "loss": 88.674, "step": 38 }, { "epoch": 0.624, "grad_norm": 1.8450833559036255, "learning_rate": 0.00011490422661761744, "loss": 88.5731, "step": 39 }, { "epoch": 0.64, "grad_norm": 1.9053994417190552, "learning_rate": 0.00010747300935864243, "loss": 88.6383, "step": 40 }, { "epoch": 0.656, "grad_norm": 1.9040968418121338, "learning_rate": 0.0001, "loss": 88.6602, "step": 41 }, { "epoch": 0.672, "grad_norm": 2.163938522338867, "learning_rate": 9.252699064135758e-05, "loss": 88.5704, "step": 42 }, { "epoch": 0.688, "grad_norm": 2.156372547149658, "learning_rate": 8.509577338238255e-05, "loss": 88.4837, "step": 43 }, { "epoch": 0.704, "grad_norm": 2.1990439891815186, "learning_rate": 7.774790660436858e-05, "loss": 88.6662, "step": 44 }, { "epoch": 0.72, "grad_norm": 2.474072217941284, "learning_rate": 7.052448255890957e-05, "loss": 88.5809, "step": 45 }, { "epoch": 0.736, "grad_norm": 2.7647547721862793, "learning_rate": 6.34658975633605e-05, "loss": 88.6076, "step": 46 }, { "epoch": 0.752, "grad_norm": 2.5142226219177246, "learning_rate": 5.6611626088244194e-05, "loss": 88.7863, "step": 47 }, { "epoch": 0.768, "grad_norm": 1.5332900285720825, "learning_rate": 5.000000000000002e-05, "loss": 88.6006, "step": 48 }, { "epoch": 0.768, "eval_loss": 11.07020378112793, "eval_runtime": 0.612, "eval_samples_per_second": 173.197, "eval_steps_per_second": 44.116, "step": 48 }, { "epoch": 0.784, "grad_norm": 1.6119565963745117, "learning_rate": 4.3667994193637796e-05, "loss": 88.609, "step": 49 }, { "epoch": 0.8, "grad_norm": 1.591171145439148, "learning_rate": 3.7651019814126654e-05, "loss": 88.6227, "step": 50 }, { "epoch": 0.816, "grad_norm": 1.7413899898529053, "learning_rate": 3.198272622290804e-05, "loss": 88.6242, "step": 51 }, { "epoch": 0.832, "grad_norm": 1.8746490478515625, "learning_rate": 2.669481281701739e-05, "loss": 88.5348, "step": 52 }, { "epoch": 0.848, "grad_norm": 1.946791172027588, "learning_rate": 2.181685175319702e-05, "loss": 88.5242, "step": 53 }, { "epoch": 0.864, "grad_norm": 1.9902241230010986, "learning_rate": 
1.7376122568400532e-05, "loss": 88.6423, "step": 54 }, { "epoch": 0.88, "grad_norm": 1.9461379051208496, "learning_rate": 1.339745962155613e-05, "loss": 88.6454, "step": 55 }, { "epoch": 0.896, "grad_norm": 2.044666290283203, "learning_rate": 9.903113209758096e-06, "loss": 88.5542, "step": 56 }, { "epoch": 0.912, "grad_norm": 1.9682707786560059, "learning_rate": 6.9126251355795864e-06, "loss": 88.6255, "step": 57 }, { "epoch": 0.928, "grad_norm": 2.2248926162719727, "learning_rate": 4.442719421385922e-06, "loss": 88.5276, "step": 58 }, { "epoch": 0.944, "grad_norm": 2.1064934730529785, "learning_rate": 2.5072087818176382e-06, "loss": 88.6175, "step": 59 }, { "epoch": 0.96, "grad_norm": 2.4718239307403564, "learning_rate": 1.1169173774871478e-06, "loss": 88.5009, "step": 60 }, { "epoch": 0.976, "grad_norm": 2.4647696018218994, "learning_rate": 2.7962028188198706e-07, "loss": 88.6283, "step": 61 }, { "epoch": 0.992, "grad_norm": 3.2278006076812744, "learning_rate": 0.0, "loss": 88.4972, "step": 62 } ], "logging_steps": 1, "max_steps": 62, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 16, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1856734494720.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }