{ "best_metric": 5.346867561340332, "best_model_checkpoint": "./results/models/mistral-dna/checkpoint-22675", "epoch": 5.0, "eval_steps": 500, "global_step": 22675, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.11025358324145534, "grad_norm": 0.142578125, "learning_rate": 0.0003991179713340684, "loss": 6.6131, "step": 500 }, { "epoch": 0.2205071664829107, "grad_norm": 0.1298828125, "learning_rate": 0.00039823594266813673, "loss": 6.0212, "step": 1000 }, { "epoch": 0.33076074972436603, "grad_norm": 0.12255859375, "learning_rate": 0.00039735391400220506, "loss": 5.8931, "step": 1500 }, { "epoch": 0.4410143329658214, "grad_norm": 0.11572265625, "learning_rate": 0.00039647188533627344, "loss": 5.822, "step": 2000 }, { "epoch": 0.5512679162072768, "grad_norm": 0.125, "learning_rate": 0.0003955898566703418, "loss": 5.7678, "step": 2500 }, { "epoch": 0.6615214994487321, "grad_norm": 0.119140625, "learning_rate": 0.00039470782800441015, "loss": 5.7204, "step": 3000 }, { "epoch": 0.7717750826901875, "grad_norm": 0.12255859375, "learning_rate": 0.00039382579933847854, "loss": 5.7025, "step": 3500 }, { "epoch": 0.8820286659316428, "grad_norm": 0.1376953125, "learning_rate": 0.00039294377067254687, "loss": 5.6733, "step": 4000 }, { "epoch": 0.9922822491730982, "grad_norm": 0.1328125, "learning_rate": 0.00039206174200661525, "loss": 5.6491, "step": 4500 }, { "epoch": 1.0, "eval_loss": 5.551197528839111, "eval_runtime": 3.3539, "eval_samples_per_second": 86.765, "eval_steps_per_second": 1.491, "step": 4535 }, { "epoch": 1.1025358324145536, "grad_norm": 0.126953125, "learning_rate": 0.0003911797133406836, "loss": 5.6279, "step": 5000 }, { "epoch": 1.2127894156560088, "grad_norm": 0.1435546875, "learning_rate": 0.00039029768467475196, "loss": 5.6068, "step": 5500 }, { "epoch": 1.3230429988974641, "grad_norm": 0.1455078125, "learning_rate": 0.0003894156560088203, "loss": 5.6068, "step": 6000 }, { "epoch": 1.4332965821389196, "grad_norm": 0.130859375, "learning_rate": 0.00038853362734288867, "loss": 5.5658, "step": 6500 }, { "epoch": 1.543550165380375, "grad_norm": 0.1318359375, "learning_rate": 0.000387651598676957, "loss": 5.5665, "step": 7000 }, { "epoch": 1.6538037486218302, "grad_norm": 0.138671875, "learning_rate": 0.0003867695700110254, "loss": 5.5565, "step": 7500 }, { "epoch": 1.7640573318632855, "grad_norm": 0.1376953125, "learning_rate": 0.00038588754134509377, "loss": 5.5477, "step": 8000 }, { "epoch": 1.8743109151047408, "grad_norm": 0.1298828125, "learning_rate": 0.0003850055126791621, "loss": 5.5295, "step": 8500 }, { "epoch": 1.9845644983461963, "grad_norm": 0.140625, "learning_rate": 0.0003841234840132304, "loss": 5.5208, "step": 9000 }, { "epoch": 2.0, "eval_loss": 5.451062202453613, "eval_runtime": 3.2969, "eval_samples_per_second": 88.264, "eval_steps_per_second": 1.517, "step": 9070 }, { "epoch": 2.0948180815876514, "grad_norm": 0.138671875, "learning_rate": 0.0003832414553472988, "loss": 5.5017, "step": 9500 }, { "epoch": 2.205071664829107, "grad_norm": 0.142578125, "learning_rate": 0.0003823594266813672, "loss": 5.4918, "step": 10000 }, { "epoch": 2.3153252480705624, "grad_norm": 0.1357421875, "learning_rate": 0.0003814773980154355, "loss": 5.4806, "step": 10500 }, { "epoch": 2.4255788313120177, "grad_norm": 0.13671875, "learning_rate": 0.00038059536934950385, "loss": 5.4877, "step": 11000 }, { "epoch": 2.535832414553473, "grad_norm": 0.138671875, "learning_rate": 0.00037971334068357223, "loss": 5.4696, "step": 11500 }, { "epoch": 2.6460859977949283, "grad_norm": 0.15625, "learning_rate": 0.0003788313120176406, "loss": 5.4781, "step": 12000 }, { "epoch": 2.7563395810363835, "grad_norm": 0.140625, "learning_rate": 0.00037794928335170894, "loss": 5.4748, "step": 12500 }, { "epoch": 2.8665931642778393, "grad_norm": 0.1591796875, "learning_rate": 0.0003770672546857773, "loss": 5.4608, "step": 13000 }, { "epoch": 2.9768467475192946, "grad_norm": 0.1923828125, "learning_rate": 0.00037618522601984565, "loss": 5.453, "step": 13500 }, { "epoch": 3.0, "eval_loss": 5.399576663970947, "eval_runtime": 3.4053, "eval_samples_per_second": 85.455, "eval_steps_per_second": 1.468, "step": 13605 }, { "epoch": 3.08710033076075, "grad_norm": 0.1865234375, "learning_rate": 0.00037530319735391404, "loss": 5.4278, "step": 14000 }, { "epoch": 3.197353914002205, "grad_norm": 0.158203125, "learning_rate": 0.00037442116868798236, "loss": 5.4226, "step": 14500 }, { "epoch": 3.3076074972436604, "grad_norm": 0.1484375, "learning_rate": 0.00037353914002205075, "loss": 5.4334, "step": 15000 }, { "epoch": 3.4178610804851157, "grad_norm": 0.162109375, "learning_rate": 0.0003726571113561191, "loss": 5.4321, "step": 15500 }, { "epoch": 3.528114663726571, "grad_norm": 0.1572265625, "learning_rate": 0.00037177508269018746, "loss": 5.4277, "step": 16000 }, { "epoch": 3.6383682469680263, "grad_norm": 0.1591796875, "learning_rate": 0.0003708930540242558, "loss": 5.4193, "step": 16500 }, { "epoch": 3.7486218302094816, "grad_norm": 0.1611328125, "learning_rate": 0.00037001102535832417, "loss": 5.4226, "step": 17000 }, { "epoch": 3.8588754134509373, "grad_norm": 0.154296875, "learning_rate": 0.00036912899669239255, "loss": 5.4214, "step": 17500 }, { "epoch": 3.9691289966923926, "grad_norm": 0.1533203125, "learning_rate": 0.0003682469680264609, "loss": 5.4218, "step": 18000 }, { "epoch": 4.0, "eval_loss": 5.367885112762451, "eval_runtime": 3.6662, "eval_samples_per_second": 79.373, "eval_steps_per_second": 1.364, "step": 18140 }, { "epoch": 4.0793825799338475, "grad_norm": 0.1748046875, "learning_rate": 0.0003673649393605292, "loss": 5.4067, "step": 18500 }, { "epoch": 4.189636163175303, "grad_norm": 0.1572265625, "learning_rate": 0.0003664829106945976, "loss": 5.3963, "step": 19000 }, { "epoch": 4.299889746416759, "grad_norm": 0.158203125, "learning_rate": 0.000365600882028666, "loss": 5.3929, "step": 19500 }, { "epoch": 4.410143329658214, "grad_norm": 0.1630859375, "learning_rate": 0.0003647188533627343, "loss": 5.3888, "step": 20000 }, { "epoch": 4.5203969128996695, "grad_norm": 0.16015625, "learning_rate": 0.00036383682469680263, "loss": 5.3982, "step": 20500 }, { "epoch": 4.630650496141125, "grad_norm": 0.173828125, "learning_rate": 0.000362954796030871, "loss": 5.3754, "step": 21000 }, { "epoch": 4.74090407938258, "grad_norm": 0.1591796875, "learning_rate": 0.0003620727673649394, "loss": 5.3854, "step": 21500 }, { "epoch": 4.851157662624035, "grad_norm": 0.1748046875, "learning_rate": 0.00036119073869900773, "loss": 5.3876, "step": 22000 }, { "epoch": 4.961411245865491, "grad_norm": 0.1767578125, "learning_rate": 0.0003603087100330761, "loss": 5.3899, "step": 22500 }, { "epoch": 5.0, "eval_loss": 5.346867561340332, "eval_runtime": 3.7974, "eval_samples_per_second": 76.63, "eval_steps_per_second": 1.317, "step": 22675 } ], "logging_steps": 500, "max_steps": 226750, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.976338069294799e+18, "train_batch_size": 64, "trial_name": null, "trial_params": null }