{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.6, "eval_steps": 50, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04, "grad_norm": 6.656943321228027, "learning_rate": 1.2000000000000002e-06, "loss": 1.6339, "step": 50 }, { "epoch": 0.04, "eval_loss": 0.9032842516899109, "eval_runtime": 1.8285, "eval_samples_per_second": 62.347, "eval_steps_per_second": 3.281, "step": 50 }, { "epoch": 0.08, "grad_norm": 3.039907693862915, "learning_rate": 2.4000000000000003e-06, "loss": 1.2834, "step": 100 }, { "epoch": 0.08, "eval_loss": 0.8288049101829529, "eval_runtime": 1.8279, "eval_samples_per_second": 62.366, "eval_steps_per_second": 3.282, "step": 100 }, { "epoch": 0.12, "grad_norm": 2.98620343208313, "learning_rate": 3.6e-06, "loss": 1.1369, "step": 150 }, { "epoch": 0.12, "eval_loss": 0.8595764636993408, "eval_runtime": 1.8209, "eval_samples_per_second": 62.607, "eval_steps_per_second": 3.295, "step": 150 }, { "epoch": 0.16, "grad_norm": 2.9106926918029785, "learning_rate": 4.800000000000001e-06, "loss": 1.1002, "step": 200 }, { "epoch": 0.16, "eval_loss": 0.8750876784324646, "eval_runtime": 1.8177, "eval_samples_per_second": 62.716, "eval_steps_per_second": 3.301, "step": 200 }, { "epoch": 0.2, "grad_norm": 2.5830564498901367, "learning_rate": 6e-06, "loss": 1.0671, "step": 250 }, { "epoch": 0.2, "eval_loss": 0.8878025412559509, "eval_runtime": 1.8312, "eval_samples_per_second": 62.255, "eval_steps_per_second": 3.277, "step": 250 }, { "epoch": 0.24, "grad_norm": 3.3819241523742676, "learning_rate": 7.2e-06, "loss": 1.0332, "step": 300 }, { "epoch": 0.24, "eval_loss": 0.8920303583145142, "eval_runtime": 1.8258, "eval_samples_per_second": 62.439, "eval_steps_per_second": 3.286, "step": 300 }, { "epoch": 0.28, "grad_norm": 2.588942050933838, "learning_rate": 8.400000000000001e-06, "loss": 1.0199, "step": 350 }, { "epoch": 0.28, "eval_loss": 0.8735177516937256, "eval_runtime": 1.8258, "eval_samples_per_second": 62.438, "eval_steps_per_second": 3.286, "step": 350 }, { "epoch": 0.32, "grad_norm": 2.964590311050415, "learning_rate": 9.600000000000001e-06, "loss": 1.0218, "step": 400 }, { "epoch": 0.32, "eval_loss": 0.8841342329978943, "eval_runtime": 1.8275, "eval_samples_per_second": 62.38, "eval_steps_per_second": 3.283, "step": 400 }, { "epoch": 0.36, "grad_norm": 4.193049907684326, "learning_rate": 1.08e-05, "loss": 0.995, "step": 450 }, { "epoch": 0.36, "eval_loss": 0.8781883120536804, "eval_runtime": 1.8206, "eval_samples_per_second": 62.616, "eval_steps_per_second": 3.296, "step": 450 }, { "epoch": 0.4, "grad_norm": 3.1733250617980957, "learning_rate": 1.2e-05, "loss": 0.9752, "step": 500 }, { "epoch": 0.4, "eval_loss": 0.8925071358680725, "eval_runtime": 1.8212, "eval_samples_per_second": 62.598, "eval_steps_per_second": 3.295, "step": 500 }, { "epoch": 0.44, "grad_norm": 4.2617034912109375, "learning_rate": 1.32e-05, "loss": 1.004, "step": 550 }, { "epoch": 0.44, "eval_loss": 0.8857673406600952, "eval_runtime": 1.8247, "eval_samples_per_second": 62.475, "eval_steps_per_second": 3.288, "step": 550 }, { "epoch": 0.48, "grad_norm": 2.422171115875244, "learning_rate": 1.44e-05, "loss": 1.0094, "step": 600 }, { "epoch": 0.48, "eval_loss": 0.885766327381134, "eval_runtime": 1.8226, "eval_samples_per_second": 62.547, "eval_steps_per_second": 3.292, "step": 600 }, { "epoch": 0.52, "grad_norm": 3.6416265964508057, "learning_rate": 1.56e-05, "loss": 0.9922, "step": 650 }, { "epoch": 0.52, "eval_loss": 0.9024901986122131, "eval_runtime": 1.8283, "eval_samples_per_second": 62.353, "eval_steps_per_second": 3.282, "step": 650 }, { "epoch": 0.56, "grad_norm": 3.2401561737060547, "learning_rate": 1.6800000000000002e-05, "loss": 0.9848, "step": 700 }, { "epoch": 0.56, "eval_loss": 0.8976176977157593, "eval_runtime": 1.8213, "eval_samples_per_second": 62.593, "eval_steps_per_second": 3.294, "step": 700 }, { "epoch": 0.6, "grad_norm": 2.61985182762146, "learning_rate": 1.8e-05, "loss": 0.9916, "step": 750 }, { "epoch": 0.6, "eval_loss": 0.873446524143219, "eval_runtime": 1.8236, "eval_samples_per_second": 62.515, "eval_steps_per_second": 3.29, "step": 750 }, { "epoch": 0.64, "grad_norm": 3.0905354022979736, "learning_rate": 1.9200000000000003e-05, "loss": 0.9961, "step": 800 }, { "epoch": 0.64, "eval_loss": 0.8803545236587524, "eval_runtime": 1.8244, "eval_samples_per_second": 62.488, "eval_steps_per_second": 3.289, "step": 800 }, { "epoch": 0.68, "grad_norm": 3.7789053916931152, "learning_rate": 2.04e-05, "loss": 1.02, "step": 850 }, { "epoch": 0.68, "eval_loss": 0.8905940651893616, "eval_runtime": 1.827, "eval_samples_per_second": 62.399, "eval_steps_per_second": 3.284, "step": 850 }, { "epoch": 0.72, "grad_norm": 2.842257261276245, "learning_rate": 2.16e-05, "loss": 1.001, "step": 900 }, { "epoch": 0.72, "eval_loss": 0.8835523128509521, "eval_runtime": 1.8313, "eval_samples_per_second": 62.251, "eval_steps_per_second": 3.276, "step": 900 }, { "epoch": 0.76, "grad_norm": 3.3730099201202393, "learning_rate": 2.2800000000000002e-05, "loss": 1.015, "step": 950 }, { "epoch": 0.76, "eval_loss": 0.953080952167511, "eval_runtime": 1.8303, "eval_samples_per_second": 62.285, "eval_steps_per_second": 3.278, "step": 950 }, { "epoch": 0.8, "grad_norm": 2.283881664276123, "learning_rate": 2.4e-05, "loss": 1.0089, "step": 1000 }, { "epoch": 0.8, "eval_loss": 0.9305523037910461, "eval_runtime": 1.8222, "eval_samples_per_second": 62.56, "eval_steps_per_second": 3.293, "step": 1000 }, { "epoch": 0.84, "grad_norm": 3.0285823345184326, "learning_rate": 2.52e-05, "loss": 1.0215, "step": 1050 }, { "epoch": 0.84, "eval_loss": 0.9584159255027771, "eval_runtime": 1.8216, "eval_samples_per_second": 62.584, "eval_steps_per_second": 3.294, "step": 1050 }, { "epoch": 0.88, "grad_norm": 2.6454532146453857, "learning_rate": 2.64e-05, "loss": 1.0535, "step": 1100 }, { "epoch": 0.88, "eval_loss": 0.9347971677780151, "eval_runtime": 1.8265, "eval_samples_per_second": 62.414, "eval_steps_per_second": 3.285, "step": 1100 }, { "epoch": 0.92, "grad_norm": 2.3046398162841797, "learning_rate": 2.7600000000000003e-05, "loss": 1.036, "step": 1150 }, { "epoch": 0.92, "eval_loss": 0.9546946883201599, "eval_runtime": 1.8227, "eval_samples_per_second": 62.545, "eval_steps_per_second": 3.292, "step": 1150 }, { "epoch": 0.96, "grad_norm": 3.419645309448242, "learning_rate": 2.88e-05, "loss": 1.0732, "step": 1200 }, { "epoch": 0.96, "eval_loss": 0.9314932823181152, "eval_runtime": 1.8207, "eval_samples_per_second": 62.613, "eval_steps_per_second": 3.295, "step": 1200 }, { "epoch": 1.0, "grad_norm": 2.927753210067749, "learning_rate": 3e-05, "loss": 1.0643, "step": 1250 }, { "epoch": 1.0, "eval_loss": 0.9249414801597595, "eval_runtime": 1.8261, "eval_samples_per_second": 62.427, "eval_steps_per_second": 3.286, "step": 1250 }, { "epoch": 1.04, "grad_norm": 2.8154520988464355, "learning_rate": 2.9998537860139564e-05, "loss": 0.6328, "step": 1300 }, { "epoch": 1.04, "eval_loss": 1.008348822593689, "eval_runtime": 1.8294, "eval_samples_per_second": 62.316, "eval_steps_per_second": 3.28, "step": 1300 }, { "epoch": 1.08, "grad_norm": 6.605989456176758, "learning_rate": 2.9994151725605313e-05, "loss": 0.708, "step": 1350 }, { "epoch": 1.08, "eval_loss": 0.9453077912330627, "eval_runtime": 1.8194, "eval_samples_per_second": 62.657, "eval_steps_per_second": 3.298, "step": 1350 }, { "epoch": 1.12, "grad_norm": 2.298051357269287, "learning_rate": 2.9986842451482876e-05, "loss": 0.6654, "step": 1400 }, { "epoch": 1.12, "eval_loss": 1.035285234451294, "eval_runtime": 1.8281, "eval_samples_per_second": 62.359, "eval_steps_per_second": 3.282, "step": 1400 }, { "epoch": 1.16, "grad_norm": 3.0220792293548584, "learning_rate": 2.9976611462729715e-05, "loss": 0.7063, "step": 1450 }, { "epoch": 1.16, "eval_loss": 1.0368359088897705, "eval_runtime": 1.8316, "eval_samples_per_second": 62.24, "eval_steps_per_second": 3.276, "step": 1450 }, { "epoch": 1.2, "grad_norm": 2.9698407649993896, "learning_rate": 2.9963460753897364e-05, "loss": 0.6812, "step": 1500 }, { "epoch": 1.2, "eval_loss": 1.0127767324447632, "eval_runtime": 1.8272, "eval_samples_per_second": 62.392, "eval_steps_per_second": 3.284, "step": 1500 }, { "epoch": 1.24, "grad_norm": 2.5555665493011475, "learning_rate": 2.9947392888742566e-05, "loss": 0.7029, "step": 1550 }, { "epoch": 1.24, "eval_loss": 1.045531153678894, "eval_runtime": 1.8258, "eval_samples_per_second": 62.438, "eval_steps_per_second": 3.286, "step": 1550 }, { "epoch": 1.28, "grad_norm": 2.3660807609558105, "learning_rate": 2.992841099972747e-05, "loss": 0.6799, "step": 1600 }, { "epoch": 1.28, "eval_loss": 1.0579793453216553, "eval_runtime": 1.8239, "eval_samples_per_second": 62.504, "eval_steps_per_second": 3.29, "step": 1600 }, { "epoch": 1.32, "grad_norm": 2.915484666824341, "learning_rate": 2.9906518787408948e-05, "loss": 0.7182, "step": 1650 }, { "epoch": 1.32, "eval_loss": 1.061601161956787, "eval_runtime": 1.8217, "eval_samples_per_second": 62.581, "eval_steps_per_second": 3.294, "step": 1650 }, { "epoch": 1.3599999999999999, "grad_norm": 2.7276549339294434, "learning_rate": 2.988172051971717e-05, "loss": 0.7168, "step": 1700 }, { "epoch": 1.3599999999999999, "eval_loss": 1.0619702339172363, "eval_runtime": 1.8419, "eval_samples_per_second": 61.891, "eval_steps_per_second": 3.257, "step": 1700 }, { "epoch": 1.4, "grad_norm": 2.576150894165039, "learning_rate": 2.9854021031123555e-05, "loss": 0.729, "step": 1750 }, { "epoch": 1.4, "eval_loss": 1.0389442443847656, "eval_runtime": 1.8255, "eval_samples_per_second": 62.447, "eval_steps_per_second": 3.287, "step": 1750 }, { "epoch": 1.44, "grad_norm": 2.996159553527832, "learning_rate": 2.9823425721698293e-05, "loss": 0.7319, "step": 1800 }, { "epoch": 1.44, "eval_loss": 1.0429059267044067, "eval_runtime": 1.8338, "eval_samples_per_second": 62.167, "eval_steps_per_second": 3.272, "step": 1800 }, { "epoch": 1.48, "grad_norm": 3.9739811420440674, "learning_rate": 2.9789940556057574e-05, "loss": 0.7281, "step": 1850 }, { "epoch": 1.48, "eval_loss": 1.0586906671524048, "eval_runtime": 1.8323, "eval_samples_per_second": 62.218, "eval_steps_per_second": 3.275, "step": 1850 }, { "epoch": 1.52, "grad_norm": 2.907900810241699, "learning_rate": 2.975357206220079e-05, "loss": 0.743, "step": 1900 }, { "epoch": 1.52, "eval_loss": 1.0349457263946533, "eval_runtime": 1.8285, "eval_samples_per_second": 62.345, "eval_steps_per_second": 3.281, "step": 1900 }, { "epoch": 1.56, "grad_norm": 3.018134117126465, "learning_rate": 2.9714327330237873e-05, "loss": 0.7251, "step": 1950 }, { "epoch": 1.56, "eval_loss": 1.0124412775039673, "eval_runtime": 1.8281, "eval_samples_per_second": 62.359, "eval_steps_per_second": 3.282, "step": 1950 }, { "epoch": 1.6, "grad_norm": 2.7769956588745117, "learning_rate": 2.9672214011007087e-05, "loss": 0.7403, "step": 2000 }, { "epoch": 1.6, "eval_loss": 1.0399030447006226, "eval_runtime": 1.8304, "eval_samples_per_second": 62.282, "eval_steps_per_second": 3.278, "step": 2000 } ], "logging_steps": 50, "max_steps": 12500, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 2000, "total_flos": 1.081999679773737e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }