{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.743589743589745, "eval_steps": 500, "global_step": 190, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05128205128205128, "grad_norm": 198.0, "learning_rate": 1.0526315789473684e-05, "loss": 36.097, "step": 1 }, { "epoch": 0.2564102564102564, "grad_norm": 142.0, "learning_rate": 5.2631578947368424e-05, "loss": 33.9388, "step": 5 }, { "epoch": 0.5128205128205128, "grad_norm": 25.625, "learning_rate": 0.00010526315789473685, "loss": 24.944, "step": 10 }, { "epoch": 0.7692307692307693, "grad_norm": 14.9375, "learning_rate": 0.00015789473684210527, "loss": 18.539, "step": 15 }, { "epoch": 0.9743589743589743, "eval_loss": 8.623817443847656, "eval_runtime": 0.2547, "eval_samples_per_second": 39.26, "eval_steps_per_second": 3.926, "step": 19 }, { "epoch": 1.0256410256410255, "grad_norm": 4.1875, "learning_rate": 0.00019998312416333227, "loss": 15.3835, "step": 20 }, { "epoch": 1.282051282051282, "grad_norm": 3.28125, "learning_rate": 0.00019939306773179497, "loss": 14.2252, "step": 25 }, { "epoch": 1.5384615384615383, "grad_norm": 6.4375, "learning_rate": 0.0001979649067087574, "loss": 13.4082, "step": 30 }, { "epoch": 1.7948717948717947, "grad_norm": 9.9375, "learning_rate": 0.00019571068366759143, "loss": 11.8891, "step": 35 }, { "epoch": 2.0, "eval_loss": 6.5198655128479, "eval_runtime": 0.2367, "eval_samples_per_second": 42.251, "eval_steps_per_second": 4.225, "step": 39 }, { "epoch": 2.051282051282051, "grad_norm": 16.375, "learning_rate": 0.00019264940672148018, "loss": 9.8637, "step": 40 }, { "epoch": 2.3076923076923075, "grad_norm": 20.25, "learning_rate": 0.00018880688924275378, "loss": 6.7911, "step": 45 }, { "epoch": 2.564102564102564, "grad_norm": 7.21875, "learning_rate": 0.00018421553219875658, "loss": 3.3014, "step": 50 }, { "epoch": 2.8205128205128203, "grad_norm": 5.4375, "learning_rate": 0.00017891405093963938, "loss": 2.3149, "step": 55 }, { "epoch": 2.9743589743589745, "eval_loss": 3.2759299278259277, "eval_runtime": 0.2543, "eval_samples_per_second": 39.323, "eval_steps_per_second": 3.932, "step": 58 }, { "epoch": 3.076923076923077, "grad_norm": 2.703125, "learning_rate": 0.0001729471487418621, "loss": 1.9629, "step": 60 }, { "epoch": 3.3333333333333335, "grad_norm": 2.078125, "learning_rate": 0.00016636513986016213, "loss": 1.7292, "step": 65 }, { "epoch": 3.58974358974359, "grad_norm": 0.9765625, "learning_rate": 0.00015922352526649803, "loss": 1.6224, "step": 70 }, { "epoch": 3.8461538461538463, "grad_norm": 0.7109375, "learning_rate": 0.00015158252465343242, "loss": 1.5266, "step": 75 }, { "epoch": 4.0, "eval_loss": 2.8999454975128174, "eval_runtime": 0.2358, "eval_samples_per_second": 42.402, "eval_steps_per_second": 4.24, "step": 78 }, { "epoch": 4.102564102564102, "grad_norm": 0.56640625, "learning_rate": 0.00014350656864820733, "loss": 1.4469, "step": 80 }, { "epoch": 4.358974358974359, "grad_norm": 0.5625, "learning_rate": 0.00013506375551927547, "loss": 1.3937, "step": 85 }, { "epoch": 4.615384615384615, "grad_norm": 0.703125, "learning_rate": 0.00012632527695645993, "loss": 1.3638, "step": 90 }, { "epoch": 4.871794871794872, "grad_norm": 0.416015625, "learning_rate": 0.00011736481776669306, "loss": 1.3332, "step": 95 }, { "epoch": 4.9743589743589745, "eval_loss": 2.796644926071167, "eval_runtime": 0.2552, "eval_samples_per_second": 39.191, "eval_steps_per_second": 3.919, "step": 97 }, { "epoch": 5.128205128205128, "grad_norm": 0.80859375, "learning_rate": 0.00010825793454723325, "loss": 1.3079, "step": 100 }, { "epoch": 5.384615384615385, "grad_norm": 0.69140625, "learning_rate": 9.908141857552737e-05, "loss": 1.2787, "step": 105 }, { "epoch": 5.641025641025641, "grad_norm": 0.482421875, "learning_rate": 8.991264828797319e-05, "loss": 1.2515, "step": 110 }, { "epoch": 5.897435897435898, "grad_norm": 0.6171875, "learning_rate": 8.082893680762619e-05, "loss": 1.2502, "step": 115 }, { "epoch": 6.0, "eval_loss": 2.7460193634033203, "eval_runtime": 0.2367, "eval_samples_per_second": 42.241, "eval_steps_per_second": 4.224, "step": 117 }, { "epoch": 6.153846153846154, "grad_norm": 0.466796875, "learning_rate": 7.190688002264308e-05, "loss": 1.2261, "step": 120 }, { "epoch": 6.410256410256411, "grad_norm": 0.6640625, "learning_rate": 6.322171071261071e-05, "loss": 1.2127, "step": 125 }, { "epoch": 6.666666666666667, "grad_norm": 0.5078125, "learning_rate": 5.484666416891109e-05, "loss": 1.2119, "step": 130 }, { "epoch": 6.923076923076923, "grad_norm": 0.578125, "learning_rate": 4.685236065835443e-05, "loss": 1.2007, "step": 135 }, { "epoch": 6.9743589743589745, "eval_loss": 2.733168840408325, "eval_runtime": 0.2595, "eval_samples_per_second": 38.536, "eval_steps_per_second": 3.854, "step": 136 }, { "epoch": 7.17948717948718, "grad_norm": 0.55859375, "learning_rate": 3.9306209937284346e-05, "loss": 1.1979, "step": 140 }, { "epoch": 7.435897435897436, "grad_norm": 0.55859375, "learning_rate": 3.227184283742591e-05, "loss": 1.1863, "step": 145 }, { "epoch": 7.6923076923076925, "grad_norm": 0.5859375, "learning_rate": 2.5808574716471856e-05, "loss": 1.1845, "step": 150 }, { "epoch": 7.948717948717949, "grad_norm": 0.416015625, "learning_rate": 1.9970905297711606e-05, "loss": 1.1904, "step": 155 }, { "epoch": 8.0, "eval_loss": 2.728332281112671, "eval_runtime": 0.2381, "eval_samples_per_second": 41.991, "eval_steps_per_second": 4.199, "step": 156 }, { "epoch": 8.205128205128204, "grad_norm": 0.44140625, "learning_rate": 1.4808059116167305e-05, "loss": 1.1728, "step": 160 }, { "epoch": 8.461538461538462, "grad_norm": 0.53515625, "learning_rate": 1.0363570446297999e-05, "loss": 1.184, "step": 165 }, { "epoch": 8.717948717948717, "grad_norm": 0.65234375, "learning_rate": 6.674916211254289e-06, "loss": 1.1746, "step": 170 }, { "epoch": 8.974358974358974, "grad_norm": 0.392578125, "learning_rate": 3.7731999690749585e-06, "loss": 1.1866, "step": 175 }, { "epoch": 8.974358974358974, "eval_loss": 2.7323360443115234, "eval_runtime": 0.2585, "eval_samples_per_second": 38.691, "eval_steps_per_second": 3.869, "step": 175 }, { "epoch": 9.23076923076923, "grad_norm": 0.396484375, "learning_rate": 1.6828896405244988e-06, "loss": 1.175, "step": 180 }, { "epoch": 9.487179487179487, "grad_norm": 0.404296875, "learning_rate": 4.216111901092501e-07, "loss": 1.1839, "step": 185 }, { "epoch": 9.743589743589745, "grad_norm": 0.4765625, "learning_rate": 0.0, "loss": 1.1715, "step": 190 }, { "epoch": 9.743589743589745, "eval_loss": 2.725883960723877, "eval_runtime": 0.2361, "eval_samples_per_second": 42.355, "eval_steps_per_second": 4.236, "step": 190 }, { "epoch": 9.743589743589745, "step": 190, "total_flos": 5.793437974192456e+17, "train_loss": 5.009378814697266, "train_runtime": 464.8218, "train_samples_per_second": 26.204, "train_steps_per_second": 0.409 } ], "logging_steps": 5, "max_steps": 190, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 100, "total_flos": 5.793437974192456e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }