{ "best_metric": 0.39903518557548523, "best_model_checkpoint": "mikhail_panzo/zlm_b128_le5_s12000/checkpoint-3000", "epoch": 5.029109947643979, "eval_steps": 500, "global_step": 3000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.08376963350785341, "grad_norm": 6.650441646575928, "learning_rate": 2.4500000000000004e-07, "loss": 1.1088, "step": 50 }, { "epoch": 0.16753926701570682, "grad_norm": 13.608625411987305, "learning_rate": 4.95e-07, "loss": 1.1077, "step": 100 }, { "epoch": 0.2513089005235602, "grad_norm": 2.7265217304229736, "learning_rate": 7.450000000000001e-07, "loss": 0.9759, "step": 150 }, { "epoch": 0.33507853403141363, "grad_norm": 5.083759307861328, "learning_rate": 9.950000000000002e-07, "loss": 0.8705, "step": 200 }, { "epoch": 0.418848167539267, "grad_norm": 1.8121678829193115, "learning_rate": 1.2450000000000002e-06, "loss": 0.8185, "step": 250 }, { "epoch": 0.5026178010471204, "grad_norm": 2.275899648666382, "learning_rate": 1.495e-06, "loss": 0.784, "step": 300 }, { "epoch": 0.5863874345549738, "grad_norm": 2.2861170768737793, "learning_rate": 1.745e-06, "loss": 0.7545, "step": 350 }, { "epoch": 0.6701570680628273, "grad_norm": 3.1991238594055176, "learning_rate": 1.9950000000000004e-06, "loss": 0.7227, "step": 400 }, { "epoch": 0.7539267015706806, "grad_norm": 1.843092679977417, "learning_rate": 2.245e-06, "loss": 0.6829, "step": 450 }, { "epoch": 0.837696335078534, "grad_norm": 1.4700403213500977, "learning_rate": 2.4950000000000003e-06, "loss": 0.6644, "step": 500 }, { "epoch": 0.837696335078534, "eval_loss": 0.5727962851524353, "eval_runtime": 293.5061, "eval_samples_per_second": 28.923, "eval_steps_per_second": 3.618, "step": 500 }, { "epoch": 0.9214659685863874, "grad_norm": 1.4362367391586304, "learning_rate": 2.7450000000000004e-06, "loss": 0.6487, "step": 550 }, { "epoch": 1.0052356020942408, "grad_norm": 1.8555901050567627, "learning_rate": 2.995e-06, "loss": 0.6301, "step": 600 }, { "epoch": 1.0890052356020943, "grad_norm": 1.9496757984161377, "learning_rate": 3.2450000000000003e-06, "loss": 0.6237, "step": 650 }, { "epoch": 1.1727748691099475, "grad_norm": 3.0705084800720215, "learning_rate": 3.495e-06, "loss": 0.6156, "step": 700 }, { "epoch": 1.256544502617801, "grad_norm": 2.2215688228607178, "learning_rate": 3.745e-06, "loss": 0.5945, "step": 750 }, { "epoch": 1.3403141361256545, "grad_norm": 2.021375894546509, "learning_rate": 3.995000000000001e-06, "loss": 0.5891, "step": 800 }, { "epoch": 1.4240837696335078, "grad_norm": 2.0354769229888916, "learning_rate": 4.245e-06, "loss": 0.5781, "step": 850 }, { "epoch": 1.5078534031413613, "grad_norm": 1.8674426078796387, "learning_rate": 4.495e-06, "loss": 0.5677, "step": 900 }, { "epoch": 1.5916230366492146, "grad_norm": 2.9264817237854004, "learning_rate": 4.745e-06, "loss": 0.5576, "step": 950 }, { "epoch": 1.675392670157068, "grad_norm": 1.9513416290283203, "learning_rate": 4.9950000000000005e-06, "loss": 0.5594, "step": 1000 }, { "epoch": 1.675392670157068, "eval_loss": 0.48248979449272156, "eval_runtime": 294.2627, "eval_samples_per_second": 28.848, "eval_steps_per_second": 3.609, "step": 1000 }, { "epoch": 1.7591623036649215, "grad_norm": 1.934941291809082, "learning_rate": 5.245e-06, "loss": 0.5571, "step": 1050 }, { "epoch": 1.8429319371727748, "grad_norm": 1.7669196128845215, "learning_rate": 5.495000000000001e-06, "loss": 0.5477, "step": 1100 }, { "epoch": 1.9267015706806283, "grad_norm": 3.954806089401245, 
"learning_rate": 5.745000000000001e-06, "loss": 0.5294, "step": 1150 }, { "epoch": 2.0104712041884816, "grad_norm": 2.8323569297790527, "learning_rate": 5.995000000000001e-06, "loss": 0.5261, "step": 1200 }, { "epoch": 2.094240837696335, "grad_norm": 2.193530797958374, "learning_rate": 6.245000000000001e-06, "loss": 0.5262, "step": 1250 }, { "epoch": 2.1780104712041886, "grad_norm": 1.5814998149871826, "learning_rate": 6.4950000000000005e-06, "loss": 0.5204, "step": 1300 }, { "epoch": 2.261780104712042, "grad_norm": 5.6610918045043945, "learning_rate": 6.745000000000001e-06, "loss": 0.5157, "step": 1350 }, { "epoch": 2.345549738219895, "grad_norm": 9.156899452209473, "learning_rate": 6.995000000000001e-06, "loss": 0.5178, "step": 1400 }, { "epoch": 2.4293193717277486, "grad_norm": 2.3170149326324463, "learning_rate": 7.245000000000001e-06, "loss": 0.5163, "step": 1450 }, { "epoch": 2.513089005235602, "grad_norm": 2.1565325260162354, "learning_rate": 7.495000000000001e-06, "loss": 0.5042, "step": 1500 }, { "epoch": 2.513089005235602, "eval_loss": 0.4464746415615082, "eval_runtime": 293.8295, "eval_samples_per_second": 28.891, "eval_steps_per_second": 3.614, "step": 1500 }, { "epoch": 2.5997905759162303, "grad_norm": 2.4758172035217285, "learning_rate": 7.745e-06, "loss": 0.5031, "step": 1550 }, { "epoch": 2.683560209424084, "grad_norm": 2.1877381801605225, "learning_rate": 7.990000000000001e-06, "loss": 0.4994, "step": 1600 }, { "epoch": 2.7673298429319373, "grad_norm": 2.4486210346221924, "learning_rate": 8.24e-06, "loss": 0.4965, "step": 1650 }, { "epoch": 2.8510994764397903, "grad_norm": 2.596200704574585, "learning_rate": 8.49e-06, "loss": 0.4988, "step": 1700 }, { "epoch": 2.934869109947644, "grad_norm": 1.7787096500396729, "learning_rate": 8.740000000000001e-06, "loss": 0.4947, "step": 1750 }, { "epoch": 3.0186387434554973, "grad_norm": 2.09403133392334, "learning_rate": 8.99e-06, "loss": 0.4845, "step": 1800 }, { "epoch": 3.102408376963351, "grad_norm": 2.0056636333465576, "learning_rate": 9.240000000000001e-06, "loss": 0.4833, "step": 1850 }, { "epoch": 3.1861780104712043, "grad_norm": 3.200199842453003, "learning_rate": 9.49e-06, "loss": 0.484, "step": 1900 }, { "epoch": 3.269947643979058, "grad_norm": 2.5462379455566406, "learning_rate": 9.74e-06, "loss": 0.4726, "step": 1950 }, { "epoch": 3.353717277486911, "grad_norm": 1.3994622230529785, "learning_rate": 9.990000000000001e-06, "loss": 0.4795, "step": 2000 }, { "epoch": 3.353717277486911, "eval_loss": 0.42617055773735046, "eval_runtime": 269.3627, "eval_samples_per_second": 31.515, "eval_steps_per_second": 3.943, "step": 2000 }, { "epoch": 3.4374869109947643, "grad_norm": 1.8979076147079468, "learning_rate": 9.952e-06, "loss": 0.4792, "step": 2050 }, { "epoch": 3.521256544502618, "grad_norm": 1.4493324756622314, "learning_rate": 9.902000000000001e-06, "loss": 0.4751, "step": 2100 }, { "epoch": 3.6050261780104713, "grad_norm": 3.3374929428100586, "learning_rate": 9.852e-06, "loss": 0.4736, "step": 2150 }, { "epoch": 3.6887958115183244, "grad_norm": 3.5062992572784424, "learning_rate": 9.802e-06, "loss": 0.4675, "step": 2200 }, { "epoch": 3.772565445026178, "grad_norm": 2.250505208969116, "learning_rate": 9.752e-06, "loss": 0.4611, "step": 2250 }, { "epoch": 3.8563350785340313, "grad_norm": 1.793270468711853, "learning_rate": 9.702e-06, "loss": 0.4605, "step": 2300 }, { "epoch": 3.940104712041885, "grad_norm": 1.663677453994751, "learning_rate": 9.652e-06, "loss": 0.4693, "step": 2350 }, { "epoch": 4.023874345549738, "grad_norm": 
2.1321282386779785, "learning_rate": 9.602e-06, "loss": 0.4587, "step": 2400 }, { "epoch": 4.107643979057592, "grad_norm": 1.7361410856246948, "learning_rate": 9.552000000000001e-06, "loss": 0.4611, "step": 2450 }, { "epoch": 4.191413612565445, "grad_norm": 2.167386770248413, "learning_rate": 9.502000000000002e-06, "loss": 0.455, "step": 2500 }, { "epoch": 4.191413612565445, "eval_loss": 0.40905508399009705, "eval_runtime": 269.8231, "eval_samples_per_second": 31.461, "eval_steps_per_second": 3.936, "step": 2500 }, { "epoch": 4.275183246073299, "grad_norm": 1.430746078491211, "learning_rate": 9.452000000000002e-06, "loss": 0.4581, "step": 2550 }, { "epoch": 4.358952879581151, "grad_norm": 2.1168527603149414, "learning_rate": 9.402e-06, "loss": 0.4516, "step": 2600 }, { "epoch": 4.442722513089005, "grad_norm": 2.1330721378326416, "learning_rate": 9.353000000000002e-06, "loss": 0.4505, "step": 2650 }, { "epoch": 4.526492146596858, "grad_norm": 1.274557113647461, "learning_rate": 9.303e-06, "loss": 0.4575, "step": 2700 }, { "epoch": 4.610261780104712, "grad_norm": 1.835204839706421, "learning_rate": 9.253000000000001e-06, "loss": 0.4491, "step": 2750 }, { "epoch": 4.694031413612565, "grad_norm": 2.0255746841430664, "learning_rate": 9.203000000000002e-06, "loss": 0.4484, "step": 2800 }, { "epoch": 4.777801047120419, "grad_norm": 2.4793522357940674, "learning_rate": 9.153e-06, "loss": 0.4489, "step": 2850 }, { "epoch": 4.861570680628272, "grad_norm": 1.7192201614379883, "learning_rate": 9.103e-06, "loss": 0.4502, "step": 2900 }, { "epoch": 4.945340314136126, "grad_norm": 1.9378846883773804, "learning_rate": 9.053000000000001e-06, "loss": 0.4483, "step": 2950 }, { "epoch": 5.029109947643979, "grad_norm": 2.4576103687286377, "learning_rate": 9.003e-06, "loss": 0.4474, "step": 3000 }, { "epoch": 5.029109947643979, "eval_loss": 0.39903518557548523, "eval_runtime": 269.2955, "eval_samples_per_second": 31.523, "eval_steps_per_second": 3.944, "step": 3000 } ], "logging_steps": 50, "max_steps": 12000, "num_input_tokens_seen": 0, "num_train_epochs": 21, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.379798854813645e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }