{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 870, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.14367816091954022, "grad_norm": 1.424246072769165, "learning_rate": 5.517241379310345e-05, "loss": 2.0273, "step": 25 }, { "epoch": 0.28735632183908044, "grad_norm": 0.9430229663848877, "learning_rate": 0.00011264367816091954, "loss": 1.8285, "step": 50 }, { "epoch": 0.43103448275862066, "grad_norm": 0.7553120851516724, "learning_rate": 0.00017011494252873563, "loss": 1.6756, "step": 75 }, { "epoch": 0.5747126436781609, "grad_norm": 0.7407281398773193, "learning_rate": 0.00019693486590038314, "loss": 1.5606, "step": 100 }, { "epoch": 0.7183908045977011, "grad_norm": 0.7509495615959167, "learning_rate": 0.0001905491698595147, "loss": 1.5438, "step": 125 }, { "epoch": 0.8620689655172413, "grad_norm": 0.7058537602424622, "learning_rate": 0.00018416347381864626, "loss": 1.4816, "step": 150 }, { "epoch": 1.0057471264367817, "grad_norm": 0.6202262043952942, "learning_rate": 0.00017777777777777779, "loss": 1.5232, "step": 175 }, { "epoch": 1.1494252873563218, "grad_norm": 0.7314026355743408, "learning_rate": 0.00017139208173690932, "loss": 1.4954, "step": 200 }, { "epoch": 1.293103448275862, "grad_norm": 0.7846217155456543, "learning_rate": 0.00016500638569604087, "loss": 1.4199, "step": 225 }, { "epoch": 1.4367816091954024, "grad_norm": 0.751372754573822, "learning_rate": 0.00015862068965517243, "loss": 1.48, "step": 250 }, { "epoch": 1.5804597701149425, "grad_norm": 0.7667829990386963, "learning_rate": 0.00015223499361430396, "loss": 1.4346, "step": 275 }, { "epoch": 1.7241379310344827, "grad_norm": 0.7370414733886719, "learning_rate": 0.00014584929757343552, "loss": 1.4334, "step": 300 }, { "epoch": 1.867816091954023, "grad_norm": 0.9422834515571594, "learning_rate": 0.00013946360153256705, "loss": 1.3969, "step": 325 }, { "epoch": 2.0114942528735633, "grad_norm": 0.7596230506896973, "learning_rate": 0.0001330779054916986, "loss": 1.4241, "step": 350 }, { "epoch": 2.1551724137931036, "grad_norm": 0.9657158851623535, "learning_rate": 0.00012669220945083016, "loss": 1.3115, "step": 375 }, { "epoch": 2.2988505747126435, "grad_norm": 0.9675273895263672, "learning_rate": 0.00012030651340996169, "loss": 1.3382, "step": 400 }, { "epoch": 2.442528735632184, "grad_norm": 0.9593296647071838, "learning_rate": 0.00011392081736909323, "loss": 1.3843, "step": 425 }, { "epoch": 2.586206896551724, "grad_norm": 0.8661421537399292, "learning_rate": 0.00010753512132822479, "loss": 1.3161, "step": 450 }, { "epoch": 2.7298850574712645, "grad_norm": 0.9521291851997375, "learning_rate": 0.00010114942528735633, "loss": 1.3355, "step": 475 }, { "epoch": 2.873563218390805, "grad_norm": 1.0091650485992432, "learning_rate": 9.476372924648788e-05, "loss": 1.3052, "step": 500 }, { "epoch": 3.0172413793103448, "grad_norm": 0.9030539989471436, "learning_rate": 8.837803320561942e-05, "loss": 1.2682, "step": 525 }, { "epoch": 3.160919540229885, "grad_norm": 1.0906364917755127, "learning_rate": 8.199233716475096e-05, "loss": 1.2097, "step": 550 }, { "epoch": 3.3045977011494254, "grad_norm": 1.1765050888061523, "learning_rate": 7.56066411238825e-05, "loss": 1.2564, "step": 575 }, { "epoch": 3.4482758620689653, "grad_norm": 1.1693239212036133, "learning_rate": 6.922094508301405e-05, "loss": 1.2207, "step": 600 }, { "epoch": 3.5919540229885056, "grad_norm": 1.0836124420166016, "learning_rate": 
6.283524904214559e-05, "loss": 1.2338, "step": 625 }, { "epoch": 3.735632183908046, "grad_norm": 1.0297603607177734, "learning_rate": 5.644955300127714e-05, "loss": 1.2179, "step": 650 }, { "epoch": 3.8793103448275863, "grad_norm": 1.3402975797653198, "learning_rate": 5.0063856960408687e-05, "loss": 1.2363, "step": 675 }, { "epoch": 4.022988505747127, "grad_norm": 1.1158450841903687, "learning_rate": 4.367816091954024e-05, "loss": 1.1844, "step": 700 }, { "epoch": 4.166666666666667, "grad_norm": 1.3297624588012695, "learning_rate": 3.729246487867178e-05, "loss": 1.1176, "step": 725 }, { "epoch": 4.310344827586207, "grad_norm": 1.3470237255096436, "learning_rate": 3.090676883780332e-05, "loss": 1.1639, "step": 750 }, { "epoch": 4.454022988505747, "grad_norm": 1.293717861175537, "learning_rate": 2.4521072796934867e-05, "loss": 1.1668, "step": 775 }, { "epoch": 4.597701149425287, "grad_norm": 1.400448203086853, "learning_rate": 1.8135376756066413e-05, "loss": 1.1751, "step": 800 }, { "epoch": 4.741379310344827, "grad_norm": 1.1660029888153076, "learning_rate": 1.1749680715197957e-05, "loss": 1.1406, "step": 825 }, { "epoch": 4.885057471264368, "grad_norm": 1.311295747756958, "learning_rate": 5.3639846743295025e-06, "loss": 1.1406, "step": 850 } ], "logging_steps": 25, "max_steps": 870, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "total_flos": 1.6060942324334592e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }