{ "best_metric": 1.719967246055603, "best_model_checkpoint": "ckpts/sft_OLMo-1B-hf/checkpoint-940", "epoch": 4.96042216358839, "eval_steps": 20, "global_step": 940, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.10554089709762533, "grad_norm": 10.3125, "learning_rate": 4.000000000000001e-06, "loss": 2.4799, "step": 20 }, { "epoch": 0.10554089709762533, "eval_loss": 2.3184142112731934, "eval_runtime": 4.0729, "eval_samples_per_second": 49.105, "eval_steps_per_second": 12.276, "step": 20 }, { "epoch": 0.21108179419525067, "grad_norm": 7.46875, "learning_rate": 8.000000000000001e-06, "loss": 2.2916, "step": 40 }, { "epoch": 0.21108179419525067, "eval_loss": 2.265637159347534, "eval_runtime": 4.034, "eval_samples_per_second": 49.579, "eval_steps_per_second": 12.395, "step": 40 }, { "epoch": 0.316622691292876, "grad_norm": 7.1875, "learning_rate": 1e-05, "loss": 2.2737, "step": 60 }, { "epoch": 0.316622691292876, "eval_loss": 2.2540640830993652, "eval_runtime": 3.9951, "eval_samples_per_second": 50.062, "eval_steps_per_second": 12.515, "step": 60 }, { "epoch": 0.42216358839050133, "grad_norm": 7.25, "learning_rate": 1e-05, "loss": 2.1889, "step": 80 }, { "epoch": 0.42216358839050133, "eval_loss": 2.2447378635406494, "eval_runtime": 4.3965, "eval_samples_per_second": 45.491, "eval_steps_per_second": 11.373, "step": 80 }, { "epoch": 0.5277044854881267, "grad_norm": 8.0, "learning_rate": 1e-05, "loss": 2.2005, "step": 100 }, { "epoch": 0.5277044854881267, "eval_loss": 2.225715398788452, "eval_runtime": 4.0633, "eval_samples_per_second": 49.221, "eval_steps_per_second": 12.305, "step": 100 }, { "epoch": 0.633245382585752, "grad_norm": 7.46875, "learning_rate": 1e-05, "loss": 2.1915, "step": 120 }, { "epoch": 0.633245382585752, "eval_loss": 2.208789587020874, "eval_runtime": 4.4371, "eval_samples_per_second": 45.075, "eval_steps_per_second": 11.269, "step": 120 }, { "epoch": 0.7387862796833773, "grad_norm": 7.875, "learning_rate": 1e-05, "loss": 2.2115, "step": 140 }, { "epoch": 0.7387862796833773, "eval_loss": 2.189687728881836, "eval_runtime": 4.427, "eval_samples_per_second": 45.177, "eval_steps_per_second": 11.294, "step": 140 }, { "epoch": 0.8443271767810027, "grad_norm": 7.28125, "learning_rate": 1e-05, "loss": 2.1754, "step": 160 }, { "epoch": 0.8443271767810027, "eval_loss": 2.1662068367004395, "eval_runtime": 4.4352, "eval_samples_per_second": 45.094, "eval_steps_per_second": 11.273, "step": 160 }, { "epoch": 0.9498680738786279, "grad_norm": 7.4375, "learning_rate": 1e-05, "loss": 2.1529, "step": 180 }, { "epoch": 0.9498680738786279, "eval_loss": 2.151796340942383, "eval_runtime": 4.578, "eval_samples_per_second": 43.688, "eval_steps_per_second": 10.922, "step": 180 }, { "epoch": 1.0554089709762533, "grad_norm": 7.4375, "learning_rate": 1e-05, "loss": 2.0596, "step": 200 }, { "epoch": 1.0554089709762533, "eval_loss": 2.150334119796753, "eval_runtime": 4.3064, "eval_samples_per_second": 46.443, "eval_steps_per_second": 11.611, "step": 200 }, { "epoch": 1.1609498680738786, "grad_norm": 8.3125, "learning_rate": 1e-05, "loss": 1.9336, "step": 220 }, { "epoch": 1.1609498680738786, "eval_loss": 2.138848066329956, "eval_runtime": 4.7845, "eval_samples_per_second": 41.802, "eval_steps_per_second": 10.45, "step": 220 }, { "epoch": 1.266490765171504, "grad_norm": 8.125, "learning_rate": 1e-05, "loss": 1.917, "step": 240 }, { "epoch": 1.266490765171504, "eval_loss": 2.1307833194732666, "eval_runtime": 4.549, "eval_samples_per_second": 43.966, "eval_steps_per_second": 10.992, "step": 240 }, { "epoch": 1.3720316622691293, "grad_norm": 8.375, "learning_rate": 1e-05, "loss": 1.9214, "step": 260 }, { "epoch": 1.3720316622691293, "eval_loss": 2.11213755607605, "eval_runtime": 4.4528, "eval_samples_per_second": 44.916, "eval_steps_per_second": 11.229, "step": 260 }, { "epoch": 1.4775725593667546, "grad_norm": 8.25, "learning_rate": 1e-05, "loss": 1.9631, "step": 280 }, { "epoch": 1.4775725593667546, "eval_loss": 2.0881500244140625, "eval_runtime": 4.489, "eval_samples_per_second": 44.553, "eval_steps_per_second": 11.138, "step": 280 }, { "epoch": 1.58311345646438, "grad_norm": 8.5, "learning_rate": 1e-05, "loss": 1.8888, "step": 300 }, { "epoch": 1.58311345646438, "eval_loss": 2.0727522373199463, "eval_runtime": 4.6234, "eval_samples_per_second": 43.258, "eval_steps_per_second": 10.815, "step": 300 }, { "epoch": 1.6886543535620053, "grad_norm": 8.75, "learning_rate": 1e-05, "loss": 1.8634, "step": 320 }, { "epoch": 1.6886543535620053, "eval_loss": 2.0583410263061523, "eval_runtime": 4.3979, "eval_samples_per_second": 45.476, "eval_steps_per_second": 11.369, "step": 320 }, { "epoch": 1.7941952506596306, "grad_norm": 9.625, "learning_rate": 1e-05, "loss": 1.8716, "step": 340 }, { "epoch": 1.7941952506596306, "eval_loss": 2.0440073013305664, "eval_runtime": 4.4336, "eval_samples_per_second": 45.111, "eval_steps_per_second": 11.278, "step": 340 }, { "epoch": 1.899736147757256, "grad_norm": 8.625, "learning_rate": 1e-05, "loss": 1.8626, "step": 360 }, { "epoch": 1.899736147757256, "eval_loss": 2.027642011642456, "eval_runtime": 4.5994, "eval_samples_per_second": 43.484, "eval_steps_per_second": 10.871, "step": 360 }, { "epoch": 2.005277044854881, "grad_norm": 10.0625, "learning_rate": 1e-05, "loss": 1.8374, "step": 380 }, { "epoch": 2.005277044854881, "eval_loss": 2.023581027984619, "eval_runtime": 4.513, "eval_samples_per_second": 44.316, "eval_steps_per_second": 11.079, "step": 380 }, { "epoch": 2.1108179419525066, "grad_norm": 10.3125, "learning_rate": 1e-05, "loss": 1.6156, "step": 400 }, { "epoch": 2.1108179419525066, "eval_loss": 2.034921169281006, "eval_runtime": 4.3548, "eval_samples_per_second": 45.926, "eval_steps_per_second": 11.482, "step": 400 }, { "epoch": 2.216358839050132, "grad_norm": 10.5, "learning_rate": 1e-05, "loss": 1.571, "step": 420 }, { "epoch": 2.216358839050132, "eval_loss": 2.0253899097442627, "eval_runtime": 4.4896, "eval_samples_per_second": 44.547, "eval_steps_per_second": 11.137, "step": 420 }, { "epoch": 2.321899736147757, "grad_norm": 10.9375, "learning_rate": 1e-05, "loss": 1.5824, "step": 440 }, { "epoch": 2.321899736147757, "eval_loss": 2.000455141067505, "eval_runtime": 4.3294, "eval_samples_per_second": 46.195, "eval_steps_per_second": 11.549, "step": 440 }, { "epoch": 2.4274406332453826, "grad_norm": 11.25, "learning_rate": 1e-05, "loss": 1.532, "step": 460 }, { "epoch": 2.4274406332453826, "eval_loss": 2.0012362003326416, "eval_runtime": 4.4248, "eval_samples_per_second": 45.2, "eval_steps_per_second": 11.3, "step": 460 }, { "epoch": 2.532981530343008, "grad_norm": 10.875, "learning_rate": 1e-05, "loss": 1.538, "step": 480 }, { "epoch": 2.532981530343008, "eval_loss": 1.9685580730438232, "eval_runtime": 4.6986, "eval_samples_per_second": 42.566, "eval_steps_per_second": 10.642, "step": 480 }, { "epoch": 2.638522427440633, "grad_norm": 11.4375, "learning_rate": 1e-05, "loss": 1.5482, "step": 500 }, { "epoch": 2.638522427440633, "eval_loss": 1.945511817932129, "eval_runtime": 4.3643, "eval_samples_per_second": 45.826, "eval_steps_per_second": 11.457, "step": 500 }, { "epoch": 2.7440633245382586, "grad_norm": 11.5625, "learning_rate": 1e-05, "loss": 1.5028, "step": 520 }, { "epoch": 2.7440633245382586, "eval_loss": 1.9389147758483887, "eval_runtime": 4.5184, "eval_samples_per_second": 44.263, "eval_steps_per_second": 11.066, "step": 520 }, { "epoch": 2.849604221635884, "grad_norm": 12.5625, "learning_rate": 1e-05, "loss": 1.4947, "step": 540 }, { "epoch": 2.849604221635884, "eval_loss": 1.9430372714996338, "eval_runtime": 4.4992, "eval_samples_per_second": 44.453, "eval_steps_per_second": 11.113, "step": 540 }, { "epoch": 2.955145118733509, "grad_norm": 11.9375, "learning_rate": 1e-05, "loss": 1.5243, "step": 560 }, { "epoch": 2.955145118733509, "eval_loss": 1.9145066738128662, "eval_runtime": 4.4679, "eval_samples_per_second": 44.764, "eval_steps_per_second": 11.191, "step": 560 }, { "epoch": 3.0606860158311346, "grad_norm": 15.0, "learning_rate": 1e-05, "loss": 1.3297, "step": 580 }, { "epoch": 3.0606860158311346, "eval_loss": 1.9249849319458008, "eval_runtime": 4.5014, "eval_samples_per_second": 44.43, "eval_steps_per_second": 11.108, "step": 580 }, { "epoch": 3.16622691292876, "grad_norm": 13.25, "learning_rate": 1e-05, "loss": 1.21, "step": 600 }, { "epoch": 3.16622691292876, "eval_loss": 1.9324084520339966, "eval_runtime": 4.7117, "eval_samples_per_second": 42.447, "eval_steps_per_second": 10.612, "step": 600 }, { "epoch": 3.271767810026385, "grad_norm": 15.875, "learning_rate": 1e-05, "loss": 1.2001, "step": 620 }, { "epoch": 3.271767810026385, "eval_loss": 1.9431959390640259, "eval_runtime": 4.4958, "eval_samples_per_second": 44.486, "eval_steps_per_second": 11.121, "step": 620 }, { "epoch": 3.3773087071240107, "grad_norm": 15.125, "learning_rate": 1e-05, "loss": 1.1686, "step": 640 }, { "epoch": 3.3773087071240107, "eval_loss": 1.9009323120117188, "eval_runtime": 4.3205, "eval_samples_per_second": 46.291, "eval_steps_per_second": 11.573, "step": 640 }, { "epoch": 3.4828496042216357, "grad_norm": 16.5, "learning_rate": 1e-05, "loss": 1.1798, "step": 660 }, { "epoch": 3.4828496042216357, "eval_loss": 1.8920202255249023, "eval_runtime": 4.5772, "eval_samples_per_second": 43.695, "eval_steps_per_second": 10.924, "step": 660 }, { "epoch": 3.588390501319261, "grad_norm": 14.1875, "learning_rate": 1e-05, "loss": 1.197, "step": 680 }, { "epoch": 3.588390501319261, "eval_loss": 1.8691601753234863, "eval_runtime": 4.5889, "eval_samples_per_second": 43.584, "eval_steps_per_second": 10.896, "step": 680 }, { "epoch": 3.6939313984168867, "grad_norm": 15.25, "learning_rate": 1e-05, "loss": 1.1745, "step": 700 }, { "epoch": 3.6939313984168867, "eval_loss": 1.8563519716262817, "eval_runtime": 4.09, "eval_samples_per_second": 48.9, "eval_steps_per_second": 12.225, "step": 700 }, { "epoch": 3.7994722955145117, "grad_norm": 16.125, "learning_rate": 1e-05, "loss": 1.1083, "step": 720 }, { "epoch": 3.7994722955145117, "eval_loss": 1.8388882875442505, "eval_runtime": 4.1971, "eval_samples_per_second": 47.653, "eval_steps_per_second": 11.913, "step": 720 }, { "epoch": 3.905013192612137, "grad_norm": 16.25, "learning_rate": 1e-05, "loss": 1.1325, "step": 740 }, { "epoch": 3.905013192612137, "eval_loss": 1.8317779302597046, "eval_runtime": 4.353, "eval_samples_per_second": 45.945, "eval_steps_per_second": 11.486, "step": 740 }, { "epoch": 4.010554089709762, "grad_norm": 16.25, "learning_rate": 1e-05, "loss": 1.0731, "step": 760 }, { "epoch": 4.010554089709762, "eval_loss": 1.8252906799316406, "eval_runtime": 4.5472, "eval_samples_per_second": 43.983, "eval_steps_per_second": 10.996, "step": 760 }, { "epoch": 4.116094986807388, "grad_norm": 15.75, "learning_rate": 1e-05, "loss": 0.8763, "step": 780 }, { "epoch": 4.116094986807388, "eval_loss": 1.8407686948776245, "eval_runtime": 4.341, "eval_samples_per_second": 46.073, "eval_steps_per_second": 11.518, "step": 780 }, { "epoch": 4.221635883905013, "grad_norm": 19.25, "learning_rate": 1e-05, "loss": 0.8789, "step": 800 }, { "epoch": 4.221635883905013, "eval_loss": 1.836584210395813, "eval_runtime": 4.5537, "eval_samples_per_second": 43.921, "eval_steps_per_second": 10.98, "step": 800 }, { "epoch": 4.327176781002638, "grad_norm": 18.625, "learning_rate": 1e-05, "loss": 0.8585, "step": 820 }, { "epoch": 4.327176781002638, "eval_loss": 1.826670527458191, "eval_runtime": 4.5393, "eval_samples_per_second": 44.06, "eval_steps_per_second": 11.015, "step": 820 }, { "epoch": 4.432717678100264, "grad_norm": 18.0, "learning_rate": 1e-05, "loss": 0.7994, "step": 840 }, { "epoch": 4.432717678100264, "eval_loss": 1.823104977607727, "eval_runtime": 4.5835, "eval_samples_per_second": 43.635, "eval_steps_per_second": 10.909, "step": 840 }, { "epoch": 4.538258575197889, "grad_norm": 17.125, "learning_rate": 1e-05, "loss": 0.828, "step": 860 }, { "epoch": 4.538258575197889, "eval_loss": 1.7835866212844849, "eval_runtime": 4.4222, "eval_samples_per_second": 45.227, "eval_steps_per_second": 11.307, "step": 860 }, { "epoch": 4.643799472295514, "grad_norm": 15.8125, "learning_rate": 1e-05, "loss": 0.8055, "step": 880 }, { "epoch": 4.643799472295514, "eval_loss": 1.776289939880371, "eval_runtime": 4.451, "eval_samples_per_second": 44.934, "eval_steps_per_second": 11.234, "step": 880 }, { "epoch": 4.74934036939314, "grad_norm": 16.125, "learning_rate": 1e-05, "loss": 0.8072, "step": 900 }, { "epoch": 4.74934036939314, "eval_loss": 1.7724196910858154, "eval_runtime": 4.6227, "eval_samples_per_second": 43.264, "eval_steps_per_second": 10.816, "step": 900 }, { "epoch": 4.854881266490765, "grad_norm": 15.1875, "learning_rate": 1e-05, "loss": 0.8029, "step": 920 }, { "epoch": 4.854881266490765, "eval_loss": 1.7451767921447754, "eval_runtime": 4.5459, "eval_samples_per_second": 43.995, "eval_steps_per_second": 10.999, "step": 920 }, { "epoch": 4.96042216358839, "grad_norm": 19.125, "learning_rate": 1e-05, "loss": 0.7929, "step": 940 }, { "epoch": 4.96042216358839, "eval_loss": 1.719967246055603, "eval_runtime": 4.7451, "eval_samples_per_second": 42.148, "eval_steps_per_second": 10.537, "step": 940 } ], "logging_steps": 20, "max_steps": 9450, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 20, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.968166912425984e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }