| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 5.0, |
| "eval_steps": 500, |
| "global_step": 250, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.10101010101010101, |
| "grad_norm": 5.43838668916967, |
| "learning_rate": 6.4000000000000006e-06, |
| "loss": 0.768, |
| "loss_nan_ranks": 0, |
| "loss_rank_avg": 0.20029328763484955, |
| "step": 5, |
| "valid_targets_mean": 3894.4, |
| "valid_targets_min": 1551 |
| }, |
| { |
| "epoch": 0.20202020202020202, |
| "grad_norm": 1.8173704060845222, |
| "learning_rate": 1.4400000000000001e-05, |
| "loss": 0.6809, |
| "loss_nan_ranks": 0, |
| "loss_rank_avg": 0.12393468618392944, |
| "step": 10, |
| "valid_targets_mean": 3313.9, |
| "valid_targets_min": 658 |
| }, |
| { |
| "epoch": 0.30303030303030304, |
| "grad_norm": 0.8994415488525478, |
| "learning_rate": 2.2400000000000002e-05, |
| "loss": 0.6367, |
| "loss_nan_ranks": 0, |
| "loss_rank_avg": 0.15267133712768555, |
| "step": 15, |
| "valid_targets_mean": 3214.2, |
| "valid_targets_min": 748 |
| }, |
| { |
| "epoch": 0.40404040404040403, |
| "grad_norm": 0.5231491285791463, |
| "learning_rate": 3.0400000000000004e-05, |
| "loss": 0.603, |
| "loss_nan_ranks": 0, |
| "loss_rank_avg": 0.1451815664768219, |
| "step": 20, |
| "valid_targets_mean": 4062.5, |
| "valid_targets_min": 752 |
| }, |
| { |
| "epoch": 0.5050505050505051, |
| "grad_norm": 0.48610136532802667, |
| "learning_rate": 3.8400000000000005e-05, |
| "loss": 0.5739, |
| "loss_nan_ranks": 0, |
| "loss_rank_avg": 0.15977895259857178, |
| "step": 25, |
| "valid_targets_mean": 4920.6, |
| "valid_targets_min": 689 |
| }, |
| { |
| "epoch": 0.6060606060606061, |
| "grad_norm": 0.3657562059313739, |
| "learning_rate": 3.9968815283639625e-05, |
| "loss": 0.531, |
| "loss_nan_ranks": 0, |
| "loss_rank_avg": 0.12183522433042526, |
| "step": 30, |
| "valid_targets_mean": 4170.1, |
| "valid_targets_min": 848 |
| }, |
| { |
| "epoch": 0.7070707070707071, |
| "grad_norm": 0.3117441303599555, |
| "learning_rate": 3.9842294026289565e-05, |
| "loss": 0.5137, |
| "loss_nan_ranks": 0, |
| "loss_rank_avg": 0.11094912886619568, |
| "step": 35, |
| "valid_targets_mean": 4246.0, |
| "valid_targets_min": 626 |
| }, |
| { |
| "epoch": 0.8080808080808081, |
| "grad_norm": 0.31427213112004465, |
| "learning_rate": 3.9619103106983835e-05, |
| "loss": 0.5033, |
| "loss_nan_ranks": 0, |
| "loss_rank_avg": 0.11524395644664764, |
| "step": 40, |
| "valid_targets_mean": 3335.3, |
| "valid_targets_min": 589 |
| }, |
| { |
| "epoch": 0.9090909090909091, |
| "grad_norm": 0.27843711420334, |
| "learning_rate": 3.930032988944623e-05, |
| "loss": 0.4921, |
| "loss_nan_ranks": 0, |
| "loss_rank_avg": 0.12719261646270752, |
| "step": 45, |
| "valid_targets_mean": 4673.1, |
| "valid_targets_min": 842 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 0.35336756908591643, |
| "learning_rate": 3.888752740474962e-05, |
| "loss": 0.4725, |
| "loss_nan_ranks": 0, |
| "loss_rank_avg": 0.23522385954856873, |
| "step": 50, |
| "valid_targets_mean": 4525.1, |
| "valid_targets_min": 618 |
| }, |
| { |
| "epoch": 1.101010101010101, |
| "grad_norm": 0.2618047514835116, |
| "learning_rate": 3.838270678510469e-05, |
| "loss": 0.4631, |
| "loss_nan_ranks": 0, |
| "loss_rank_avg": 0.0997854694724083, |
| "step": 55, |
| "valid_targets_mean": 3254.4, |
| "valid_targets_min": 620 |
| }, |
| { |
| "epoch": 1.202020202020202, |
| "grad_norm": 0.2596681731982267, |
| "learning_rate": 3.778832746582596e-05, |
| "loss": 0.4695, |
| "loss_nan_ranks": 0, |
| "loss_rank_avg": 0.15060186386108398, |
| "step": 60, |
| "valid_targets_mean": 5336.2, |
| "valid_targets_min": 994 |
| }, |
| { |
| "epoch": 1.303030303030303, |
| "grad_norm": 0.26134191299357834, |
| "learning_rate": 3.710728520321014e-05, |
| "loss": 0.4622, |
| "loss_nan_ranks": 0, |
| "loss_rank_avg": 0.10998662561178207, |
| "step": 65, |
| "valid_targets_mean": 3828.9, |
| "valid_targets_min": 813 |
| }, |
| { |
| "epoch": 1.404040404040404, |
| "grad_norm": 0.2505910129541225, |
| "learning_rate": 3.634289796670257e-05, |
| "loss": 0.4639, |
| "loss_nan_ranks": 0, |
| "loss_rank_avg": 0.11412525177001953, |
| "step": 70, |
| "valid_targets_mean": 4118.6, |
| "valid_targets_min": 589 |
| }, |
| { |
| "epoch": 1.5050505050505052, |
| "grad_norm": 0.28469306377718256, |
| "learning_rate": 3.549888977408359e-05, |
| "loss": 0.4547, |
| "loss_nan_ranks": 0, |
| "loss_rank_avg": 0.10224676132202148, |
| "step": 75, |
| "valid_targets_mean": 3621.8, |
| "valid_targets_min": 807 |
| }, |
| { |
| "epoch": 1.606060606060606, |
| "grad_norm": 0.26506094775038846, |
| "learning_rate": 3.457937254842823e-05, |
| "loss": 0.4478, |
| "loss_nan_ranks": 0, |
| "loss_rank_avg": 0.1216074526309967, |
| "step": 80, |
| "valid_targets_mean": 4121.2, |
| "valid_targets_min": 900 |
| }, |
| { |
| "epoch": 1.7070707070707072, |
| "grad_norm": 0.27422041538604486, |
| "learning_rate": 3.3588826085230336e-05, |
| "loss": 0.4405, |
| "loss_nan_ranks": 0, |
| "loss_rank_avg": 0.09807954728603363, |
| "step": 85, |
| "valid_targets_mean": 3189.9, |
| "valid_targets_min": 777 |
| }, |
| { |
| "epoch": 1.808080808080808, |
| "grad_norm": 0.25008604850493343, |
| "learning_rate": 3.253207622728921e-05, |
| "loss": 0.441, |
| "loss_nan_ranks": 0, |
| "loss_rank_avg": 0.11396118998527527, |
| "step": 90, |
| "valid_targets_mean": 4577.4, |
| "valid_targets_min": 875 |
| }, |
| { |
| "epoch": 1.9090909090909092, |
| "grad_norm": 0.26410650818418535, |
| "learning_rate": 3.141427135368864e-05, |
| "loss": 0.4447, |
| "loss_nan_ranks": 0, |
| "loss_rank_avg": 0.10761921852827072, |
| "step": 95, |
| "valid_targets_mean": 3803.5, |
| "valid_targets_min": 613 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 0.37011104936880673, |
| "learning_rate": 3.024085729741143e-05, |
| "loss": 0.4481, |
| "loss_nan_ranks": 0, |
| "loss_rank_avg": 0.254130095243454, |
| "step": 100, |
| "valid_targets_mean": 4326.2, |
| "valid_targets_min": 550 |
| }, |
| { |
| "epoch": 2.101010101010101, |
| "grad_norm": 0.2633444829921788, |
| "learning_rate": 2.9017550813788616e-05, |
| "loss": 0.4286, |
| "loss_nan_ranks": 0, |
| "loss_rank_avg": 0.10730044543743134, |
| "step": 105, |
| "valid_targets_mean": 4681.4, |
| "valid_targets_min": 808 |
| }, |
| { |
| "epoch": 2.202020202020202, |
| "grad_norm": 0.27947047290967925, |
| "learning_rate": 2.7750311729042062e-05, |
| "loss": 0.4249, |
| "loss_nan_ranks": 0, |
| "loss_rank_avg": 0.11274366080760956, |
| "step": 110, |
| "valid_targets_mean": 4363.5, |
| "valid_targets_min": 721 |
| }, |
| { |
| "epoch": 2.303030303030303, |
| "grad_norm": 0.26878537952096837, |
| "learning_rate": 2.6445313904610227e-05, |
| "loss": 0.4319, |
| "loss_nan_ranks": 0, |
| "loss_rank_avg": 0.10825060307979584, |
| "step": 115, |
| "valid_targets_mean": 3570.9, |
| "valid_targets_min": 740 |
| }, |
| { |
| "epoch": 2.404040404040404, |
| "grad_norm": 0.27801838465637635, |
| "learning_rate": 2.510891515871581e-05, |
| "loss": 0.4335, |
| "loss_nan_ranks": 0, |
| "loss_rank_avg": 0.09939642250537872, |
| "step": 120, |
| "valid_targets_mean": 3389.2, |
| "valid_targets_min": 712 |
| }, |
| { |
| "epoch": 2.505050505050505, |
| "grad_norm": 0.26781305081846457, |
| "learning_rate": 2.37476262917145e-05, |
| "loss": 0.4247, |
| "loss_nan_ranks": 0, |
| "loss_rank_avg": 0.109991654753685, |
| "step": 125, |
| "valid_targets_mean": 4646.0, |
| "valid_targets_min": 748 |
| }, |
| { |
| "epoch": 2.606060606060606, |
| "grad_norm": 0.26237181084548494, |
| "learning_rate": 2.2368079366130028e-05, |
| "loss": 0.4187, |
| "loss_nan_ranks": 0, |
| "loss_rank_avg": 0.11524119973182678, |
| "step": 130, |
| "valid_targets_mean": 4932.1, |
| "valid_targets_min": 845 |
| }, |
| { |
| "epoch": 2.707070707070707, |
| "grad_norm": 0.2585522405239844, |
| "learning_rate": 2.097699539591227e-05, |
| "loss": 0.4274, |
| "loss_nan_ranks": 0, |
| "loss_rank_avg": 0.08941829204559326, |
| "step": 135, |
| "valid_targets_mean": 3169.6, |
| "valid_targets_min": 832 |
| }, |
| { |
| "epoch": 2.808080808080808, |
| "grad_norm": 0.26016350721342923, |
| "learning_rate": 1.9581151602332865e-05, |
| "loss": 0.4177, |
| "loss_nan_ranks": 0, |
| "loss_rank_avg": 0.1133737787604332, |
| "step": 140, |
| "valid_targets_mean": 5011.3, |
| "valid_targets_min": 1002 |
| }, |
| { |
| "epoch": 2.909090909090909, |
| "grad_norm": 0.27030558024007334, |
| "learning_rate": 1.8187348396044402e-05, |
| "loss": 0.4231, |
| "loss_nan_ranks": 0, |
| "loss_rank_avg": 0.10732856392860413, |
| "step": 145, |
| "valid_targets_mean": 4244.3, |
| "valid_targets_min": 849 |
| }, |
| { |
| "epoch": 3.0, |
| "grad_norm": 0.3695239483214702, |
| "learning_rate": 1.6802376246163307e-05, |
| "loss": 0.4307, |
| "loss_nan_ranks": 0, |
| "loss_rank_avg": 0.2080117017030716, |
| "step": 150, |
| "valid_targets_mean": 3466.7, |
| "valid_targets_min": 600 |
| }, |
| { |
| "epoch": 3.101010101010101, |
| "grad_norm": 0.2516175673157399, |
| "learning_rate": 1.5432982597786886e-05, |
| "loss": 0.4228, |
| "loss_nan_ranks": 0, |
| "loss_rank_avg": 0.11081644892692566, |
| "step": 155, |
| "valid_targets_mean": 4376.6, |
| "valid_targets_min": 1393 |
| }, |
| { |
| "epoch": 3.202020202020202, |
| "grad_norm": 0.25899041127457506, |
| "learning_rate": 1.4085838999119075e-05, |
| "loss": 0.4183, |
| "loss_nan_ranks": 0, |
| "loss_rank_avg": 0.10279497504234314, |
| "step": 160, |
| "valid_targets_mean": 3922.6, |
| "valid_targets_min": 1037 |
| }, |
| { |
| "epoch": 3.303030303030303, |
| "grad_norm": 0.2568302854985244, |
| "learning_rate": 1.2767508598358158e-05, |
| "loss": 0.4119, |
| "loss_nan_ranks": 0, |
| "loss_rank_avg": 0.10400432348251343, |
| "step": 165, |
| "valid_targets_mean": 4147.3, |
| "valid_targets_min": 840 |
| }, |
| { |
| "epoch": 3.404040404040404, |
| "grad_norm": 0.2575745984749214, |
| "learning_rate": 1.1484414168698547e-05, |
| "loss": 0.4109, |
| "loss_nan_ranks": 0, |
| "loss_rank_avg": 0.1014273464679718, |
| "step": 170, |
| "valid_targets_mean": 3870.5, |
| "valid_targets_min": 738 |
| }, |
| { |
| "epoch": 3.505050505050505, |
| "grad_norm": 0.2708052183398422, |
| "learning_rate": 1.0242806817225344e-05, |
| "loss": 0.4183, |
| "loss_nan_ranks": 0, |
| "loss_rank_avg": 0.0942554920911789, |
| "step": 175, |
| "valid_targets_mean": 4129.5, |
| "valid_targets_min": 691 |
| }, |
| { |
| "epoch": 3.606060606060606, |
| "grad_norm": 0.39589597035376406, |
| "learning_rate": 9.048735530148998e-06, |
| "loss": 0.4057, |
| "loss_nan_ranks": 0, |
| "loss_rank_avg": 0.10414694994688034, |
| "step": 180, |
| "valid_targets_mean": 3647.8, |
| "valid_targets_min": 1355 |
| }, |
| { |
| "epoch": 3.707070707070707, |
| "grad_norm": 0.2754352204699097, |
| "learning_rate": 7.908017702752504e-06, |
| "loss": 0.4167, |
| "loss_nan_ranks": 0, |
| "loss_rank_avg": 0.11268728971481323, |
| "step": 185, |
| "valid_targets_mean": 4025.8, |
| "valid_targets_min": 631 |
| }, |
| { |
| "epoch": 3.808080808080808, |
| "grad_norm": 0.26307739642578865, |
| "learning_rate": 6.826210797626389e-06, |
| "loss": 0.4158, |
| "loss_nan_ranks": 0, |
| "loss_rank_avg": 0.08769197762012482, |
| "step": 190, |
| "valid_targets_mean": 3541.8, |
| "valid_targets_min": 1132 |
| }, |
| { |
| "epoch": 3.909090909090909, |
| "grad_norm": 0.25965429680059315, |
| "learning_rate": 5.8085852692695864e-06, |
| "loss": 0.4088, |
| "loss_nan_ranks": 0, |
| "loss_rank_avg": 0.10568307340145111, |
| "step": 195, |
| "valid_targets_mean": 4072.8, |
| "valid_targets_min": 658 |
| }, |
| { |
| "epoch": 4.0, |
| "grad_norm": 0.37112181378749864, |
| "learning_rate": 4.8600988869648745e-06, |
| "loss": 0.4077, |
| "loss_nan_ranks": 0, |
| "loss_rank_avg": 0.18167036771774292, |
| "step": 200, |
| "valid_targets_mean": 3388.4, |
| "valid_targets_min": 757 |
| }, |
| { |
| "epoch": 4.101010101010101, |
| "grad_norm": 0.24190724490489265, |
| "learning_rate": 3.985372581025333e-06, |
| "loss": 0.3992, |
| "loss_nan_ranks": 0, |
| "loss_rank_avg": 0.09219323098659515, |
| "step": 205, |
| "valid_targets_mean": 3395.9, |
| "valid_targets_min": 1280 |
| }, |
| { |
| "epoch": 4.202020202020202, |
| "grad_norm": 0.26150260454768026, |
| "learning_rate": 3.1886679300863156e-06, |
| "loss": 0.407, |
| "loss_nan_ranks": 0, |
| "loss_rank_avg": 0.09850673377513885, |
| "step": 210, |
| "valid_targets_mean": 3216.6, |
| "valid_targets_min": 784 |
| }, |
| { |
| "epoch": 4.303030303030303, |
| "grad_norm": 0.26872823184659334, |
| "learning_rate": 2.473866399122733e-06, |
| "loss": 0.4133, |
| "loss_nan_ranks": 0, |
| "loss_rank_avg": 0.10747270286083221, |
| "step": 215, |
| "valid_targets_mean": 3487.2, |
| "valid_targets_min": 773 |
| }, |
| { |
| "epoch": 4.404040404040404, |
| "grad_norm": 0.2549088626047512, |
| "learning_rate": 1.8444504293418286e-06, |
| "loss": 0.4124, |
| "loss_nan_ranks": 0, |
| "loss_rank_avg": 0.10171965509653091, |
| "step": 220, |
| "valid_targets_mean": 3727.9, |
| "valid_targets_min": 928 |
| }, |
| { |
| "epoch": 4.505050505050505, |
| "grad_norm": 0.32301587125223463, |
| "learning_rate": 1.3034864720797112e-06, |
| "loss": 0.4125, |
| "loss_nan_ranks": 0, |
| "loss_rank_avg": 0.11444351077079773, |
| "step": 225, |
| "valid_targets_mean": 3867.6, |
| "valid_targets_min": 791 |
| }, |
| { |
| "epoch": 4.606060606060606, |
| "grad_norm": 0.24811664458418323, |
| "learning_rate": 8.536100493586552e-07, |
| "loss": 0.4023, |
| "loss_nan_ranks": 0, |
| "loss_rank_avg": 0.0989425927400589, |
| "step": 230, |
| "valid_targets_mean": 3649.8, |
| "valid_targets_min": 675 |
| }, |
| { |
| "epoch": 4.707070707070707, |
| "grad_norm": 0.24999205236156857, |
| "learning_rate": 4.970129138887347e-07, |
| "loss": 0.4078, |
| "loss_nan_ranks": 0, |
| "loss_rank_avg": 0.09448540210723877, |
| "step": 235, |
| "valid_targets_mean": 3779.3, |
| "valid_targets_min": 613 |
| }, |
| { |
| "epoch": 4.808080808080808, |
| "grad_norm": 0.2533525398749002, |
| "learning_rate": 2.3543237106894434e-07, |
| "loss": 0.4138, |
| "loss_nan_ranks": 0, |
| "loss_rank_avg": 0.10717539489269257, |
| "step": 240, |
| "valid_targets_mean": 3630.1, |
| "valid_targets_min": 1185 |
| }, |
| { |
| "epoch": 4.909090909090909, |
| "grad_norm": 0.2324383918083103, |
| "learning_rate": 7.01428150099126e-08, |
| "loss": 0.4099, |
| "loss_nan_ranks": 0, |
| "loss_rank_avg": 0.09807487577199936, |
| "step": 245, |
| "valid_targets_mean": 4431.1, |
| "valid_targets_min": 1588 |
| }, |
| { |
| "epoch": 5.0, |
| "grad_norm": 0.3564637724019629, |
| "learning_rate": 1.949519813915224e-09, |
| "loss": 0.4103, |
| "loss_nan_ranks": 0, |
| "loss_rank_avg": 0.20247647166252136, |
| "step": 250, |
| "valid_targets_mean": 3850.3, |
| "valid_targets_min": 771 |
| }, |
| { |
| "epoch": 5.0, |
| "loss_nan_ranks": 0, |
| "loss_rank_avg": 0.20247647166252136, |
| "step": 250, |
| "total_flos": 4.268418036190413e+17, |
| "train_loss": 0.4559480676651001, |
| "train_runtime": 5383.7809, |
| "train_samples_per_second": 2.935, |
| "train_steps_per_second": 0.046, |
| "valid_targets_mean": 3850.3, |
| "valid_targets_min": 771 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 250, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 5, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": false, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 4.268418036190413e+17, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|