{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.932811944543192, "eval_steps": 500, "global_step": 16500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.08887308922858159, "grad_norm": 20.37746810913086, "learning_rate": 4.851878184619031e-05, "loss": 0.6617, "step": 500 }, { "epoch": 0.17774617845716317, "grad_norm": 8.832418441772461, "learning_rate": 4.703756369238061e-05, "loss": 0.5007, "step": 1000 }, { "epoch": 0.26661926768574473, "grad_norm": 112.71693420410156, "learning_rate": 4.555634553857092e-05, "loss": 0.4867, "step": 1500 }, { "epoch": 0.35549235691432635, "grad_norm": 0.7494603991508484, "learning_rate": 4.407512738476123e-05, "loss": 0.5113, "step": 2000 }, { "epoch": 0.4443654461429079, "grad_norm": 0.5181692838668823, "learning_rate": 4.259390923095153e-05, "loss": 0.557, "step": 2500 }, { "epoch": 0.5332385353714895, "grad_norm": 0.8127353191375732, "learning_rate": 4.1112691077141844e-05, "loss": 0.4746, "step": 3000 }, { "epoch": 0.6221116246000711, "grad_norm": 52.21469497680664, "learning_rate": 3.9631472923332156e-05, "loss": 0.4848, "step": 3500 }, { "epoch": 0.7109847138286527, "grad_norm": 3.922440767288208, "learning_rate": 3.815025476952246e-05, "loss": 0.4689, "step": 4000 }, { "epoch": 0.7998578030572343, "grad_norm": 39.936492919921875, "learning_rate": 3.6669036615712765e-05, "loss": 0.4724, "step": 4500 }, { "epoch": 0.8887308922858158, "grad_norm": 3.033518075942993, "learning_rate": 3.518781846190307e-05, "loss": 0.4384, "step": 5000 }, { "epoch": 0.9776039815143974, "grad_norm": 8.464579582214355, "learning_rate": 3.370660030809338e-05, "loss": 0.4534, "step": 5500 }, { "epoch": 1.066477070742979, "grad_norm": 0.24204222857952118, "learning_rate": 3.2225382154283686e-05, "loss": 0.3978, "step": 6000 }, { "epoch": 1.1553501599715605, "grad_norm": 0.057052597403526306, "learning_rate": 3.074416400047399e-05, "loss": 0.3852, "step": 6500 }, { "epoch": 1.2442232492001422, "grad_norm": 0.15880955755710602, "learning_rate": 2.92629458466643e-05, "loss": 0.3915, "step": 7000 }, { "epoch": 1.3330963384287238, "grad_norm": 0.09902948141098022, "learning_rate": 2.7781727692854603e-05, "loss": 0.3785, "step": 7500 }, { "epoch": 1.4219694276573054, "grad_norm": 0.15144290030002594, "learning_rate": 2.6300509539044908e-05, "loss": 0.3587, "step": 8000 }, { "epoch": 1.510842516885887, "grad_norm": 0.22604715824127197, "learning_rate": 2.481929138523522e-05, "loss": 0.3811, "step": 8500 }, { "epoch": 1.5997156061144686, "grad_norm": 14.215106964111328, "learning_rate": 2.3338073231425524e-05, "loss": 0.3591, "step": 9000 }, { "epoch": 1.68858869534305, "grad_norm": 0.2168210744857788, "learning_rate": 2.1856855077615832e-05, "loss": 0.37, "step": 9500 }, { "epoch": 1.7774617845716318, "grad_norm": 0.15380479395389557, "learning_rate": 2.0375636923806137e-05, "loss": 0.3492, "step": 10000 }, { "epoch": 1.8663348738002132, "grad_norm": 0.09666112810373306, "learning_rate": 1.889441876999645e-05, "loss": 0.3922, "step": 10500 }, { "epoch": 1.9552079630287948, "grad_norm": 0.7467890381813049, "learning_rate": 1.7413200616186753e-05, "loss": 0.3786, "step": 11000 }, { "epoch": 2.0440810522573765, "grad_norm": 1.9722317457199097, "learning_rate": 1.593198246237706e-05, "loss": 0.3258, "step": 11500 }, { "epoch": 2.132954141485958, "grad_norm": 0.07470700144767761, "learning_rate": 1.4450764308567366e-05, "loss": 0.2807, "step": 12000 }, { "epoch": 2.2218272307145397, "grad_norm": 0.11256339401006699, "learning_rate": 1.2969546154757672e-05, "loss": 0.2751, "step": 12500 }, { "epoch": 2.310700319943121, "grad_norm": 0.010098825208842754, "learning_rate": 1.148832800094798e-05, "loss": 0.2385, "step": 13000 }, { "epoch": 2.399573409171703, "grad_norm": 0.15252766013145447, "learning_rate": 1.0007109847138287e-05, "loss": 0.2379, "step": 13500 }, { "epoch": 2.4884464984002843, "grad_norm": 17.05082130432129, "learning_rate": 8.525891693328595e-06, "loss": 0.2706, "step": 14000 }, { "epoch": 2.5773195876288657, "grad_norm": 0.11130794882774353, "learning_rate": 7.0446735395189e-06, "loss": 0.2591, "step": 14500 }, { "epoch": 2.6661926768574475, "grad_norm": 74.68379974365234, "learning_rate": 5.5634553857092076e-06, "loss": 0.3314, "step": 15000 }, { "epoch": 2.7550657660860294, "grad_norm": 15.91612434387207, "learning_rate": 4.082237231899514e-06, "loss": 0.2514, "step": 15500 }, { "epoch": 2.8439388553146108, "grad_norm": 39.0718994140625, "learning_rate": 2.601019078089821e-06, "loss": 0.2358, "step": 16000 }, { "epoch": 2.932811944543192, "grad_norm": 0.09988761693239212, "learning_rate": 1.119800924280128e-06, "loss": 0.2359, "step": 16500 } ], "logging_steps": 500, "max_steps": 16878, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4794112531464192.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }