{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9972602739726028, "eval_steps": 500, "global_step": 182, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005479452054794521, "grad_norm": 26.082090377807617, "learning_rate": 1.0526315789473684e-05, "loss": 1.7702, "step": 1 }, { "epoch": 0.0273972602739726, "grad_norm": 1.9692896604537964, "learning_rate": 5.2631578947368424e-05, "loss": 1.7256, "step": 5 }, { "epoch": 0.0547945205479452, "grad_norm": 1.138301968574524, "learning_rate": 0.00010526315789473685, "loss": 1.6367, "step": 10 }, { "epoch": 0.0821917808219178, "grad_norm": 1.4542856216430664, "learning_rate": 0.00015789473684210527, "loss": 1.5304, "step": 15 }, { "epoch": 0.1095890410958904, "grad_norm": 1.3035956621170044, "learning_rate": 0.0001999814270411335, "loss": 1.4406, "step": 20 }, { "epoch": 0.136986301369863, "grad_norm": 0.91375333070755, "learning_rate": 0.00019933209759891317, "loss": 1.3465, "step": 25 }, { "epoch": 0.1643835616438356, "grad_norm": 0.6928793787956238, "learning_rate": 0.00019776100779878345, "loss": 1.2954, "step": 30 }, { "epoch": 0.1917808219178082, "grad_norm": 0.412971556186676, "learning_rate": 0.00019528273669757972, "loss": 1.2706, "step": 35 }, { "epoch": 0.2191780821917808, "grad_norm": 0.44875410199165344, "learning_rate": 0.00019192028161552847, "loss": 1.2453, "step": 40 }, { "epoch": 0.2465753424657534, "grad_norm": 0.2584894001483917, "learning_rate": 0.0001877048447307252, "loss": 1.2217, "step": 45 }, { "epoch": 0.273972602739726, "grad_norm": 0.2488676905632019, "learning_rate": 0.00018267554353596025, "loss": 1.1993, "step": 50 }, { "epoch": 0.3013698630136986, "grad_norm": 0.24647608399391174, "learning_rate": 0.00017687904784473188, "loss": 1.1941, "step": 55 }, { "epoch": 0.3287671232876712, "grad_norm": 0.3073161840438843, "learning_rate": 0.00017036914671487852, "loss": 1.1967, "step": 60 }, { "epoch": 0.3561643835616438, "grad_norm": 0.2503437399864197, "learning_rate": 0.00016320624930859904, "loss": 1.1889, "step": 65 }, { "epoch": 0.3835616438356164, "grad_norm": 0.2762184739112854, "learning_rate": 0.00015545682432067067, "loss": 1.17, "step": 70 }, { "epoch": 0.410958904109589, "grad_norm": 0.25446659326553345, "learning_rate": 0.00014719278317673655, "loss": 1.1707, "step": 75 }, { "epoch": 0.4383561643835616, "grad_norm": 0.2860565483570099, "learning_rate": 0.00013849081272532544, "loss": 1.1699, "step": 80 }, { "epoch": 0.4657534246575342, "grad_norm": 0.26391202211380005, "learning_rate": 0.00012943166361594242, "loss": 1.1557, "step": 85 }, { "epoch": 0.4931506849315068, "grad_norm": 0.27276208996772766, "learning_rate": 0.00012009940096678452, "loss": 1.1546, "step": 90 }, { "epoch": 0.5205479452054794, "grad_norm": 0.3492254912853241, "learning_rate": 0.00011058062427557229, "loss": 1.1549, "step": 95 }, { "epoch": 0.547945205479452, "grad_norm": 0.2584497928619385, "learning_rate": 0.00010096366381239808, "loss": 1.1406, "step": 100 }, { "epoch": 0.5753424657534246, "grad_norm": 0.3101947605609894, "learning_rate": 9.133776095173015e-05, "loss": 1.1437, "step": 105 }, { "epoch": 0.6027397260273972, "grad_norm": 0.33753615617752075, "learning_rate": 8.179224004974857e-05, "loss": 1.131, "step": 110 }, { "epoch": 0.6301369863013698, "grad_norm": 0.28552067279815674, "learning_rate": 7.24156795516461e-05, "loss": 1.1375, "step": 115 }, { "epoch": 0.6575342465753424, "grad_norm": 0.2846600413322449, "learning_rate": 6.32950900206708e-05, "loss": 1.1499, "step": 120 }, { "epoch": 0.684931506849315, "grad_norm": 0.2724830210208893, "learning_rate": 5.451510671645807e-05, "loss": 1.1364, "step": 125 }, { "epoch": 0.7123287671232876, "grad_norm": 0.23818813264369965, "learning_rate": 4.61572042151878e-05, "loss": 1.1307, "step": 130 }, { "epoch": 0.7397260273972602, "grad_norm": 0.3082185685634613, "learning_rate": 3.829894035956306e-05, "loss": 1.1422, "step": 135 }, { "epoch": 0.7671232876712328, "grad_norm": 0.26166653633117676, "learning_rate": 3.101323655443882e-05, "loss": 1.1303, "step": 140 }, { "epoch": 0.7945205479452054, "grad_norm": 0.2606567442417145, "learning_rate": 2.4367701086656624e-05, "loss": 1.1238, "step": 145 }, { "epoch": 0.821917808219178, "grad_norm": 0.27404358983039856, "learning_rate": 1.8424001748393905e-05, "loss": 1.1377, "step": 150 }, { "epoch": 0.8493150684931506, "grad_norm": 0.24582931399345398, "learning_rate": 1.3237293585821786e-05, "loss": 1.1381, "step": 155 }, { "epoch": 0.8767123287671232, "grad_norm": 0.25582000613212585, "learning_rate": 8.855707083324183e-06, "loss": 1.1419, "step": 160 }, { "epoch": 0.9041095890410958, "grad_norm": 0.2417668253183365, "learning_rate": 5.319901532714877e-06, "loss": 1.1263, "step": 165 }, { "epoch": 0.9315068493150684, "grad_norm": 0.23573410511016846, "learning_rate": 2.66268773199988e-06, "loss": 1.1412, "step": 170 }, { "epoch": 0.958904109589041, "grad_norm": 0.23247532546520233, "learning_rate": 9.087235148824368e-07, "loss": 1.1378, "step": 175 }, { "epoch": 0.9863013698630136, "grad_norm": 0.2713688015937805, "learning_rate": 7.428493637002821e-08, "loss": 1.1394, "step": 180 }, { "epoch": 0.9972602739726028, "eval_loss": 1.641967535018921, "eval_runtime": 0.6331, "eval_samples_per_second": 14.215, "eval_steps_per_second": 1.579, "step": 182 }, { "epoch": 0.9972602739726028, "step": 182, "total_flos": 8.060051304564654e+17, "train_loss": 1.2164833742183643, "train_runtime": 689.7556, "train_samples_per_second": 50.685, "train_steps_per_second": 0.264 } ], "logging_steps": 5, "max_steps": 182, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.060051304564654e+17, "train_batch_size": 12, "trial_name": null, "trial_params": null }