{ "best_metric": 3.699657678604126, "best_model_checkpoint": "/scratch/ka2773/project/lm-mem/checkpoints/gpt2_40m_12-768-1024_a_02/checkpoint-31000", "epoch": 8.481532147742818, "global_step": 31000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.14, "learning_rate": 6e-05, "loss": 6.6803, "step": 500 }, { "epoch": 0.14, "eval_loss": 6.017919540405273, "eval_runtime": 9.4636, "eval_samples_per_second": 24.938, "step": 500 }, { "epoch": 0.27, "learning_rate": 6e-05, "loss": 6.0117, "step": 1000 }, { "epoch": 0.27, "eval_loss": 5.752170085906982, "eval_runtime": 9.4619, "eval_samples_per_second": 24.942, "step": 1000 }, { "epoch": 0.41, "learning_rate": 6e-05, "loss": 5.8304, "step": 1500 }, { "epoch": 0.41, "eval_loss": 5.598978519439697, "eval_runtime": 9.4601, "eval_samples_per_second": 24.947, "step": 1500 }, { "epoch": 0.55, "learning_rate": 6e-05, "loss": 5.6932, "step": 2000 }, { "epoch": 0.55, "eval_loss": 5.4606218338012695, "eval_runtime": 9.4514, "eval_samples_per_second": 24.97, "step": 2000 }, { "epoch": 0.68, "learning_rate": 6e-05, "loss": 5.5582, "step": 2500 }, { "epoch": 0.68, "eval_loss": 5.324854373931885, "eval_runtime": 9.4555, "eval_samples_per_second": 24.959, "step": 2500 }, { "epoch": 0.82, "learning_rate": 6e-05, "loss": 5.4194, "step": 3000 }, { "epoch": 0.82, "eval_loss": 5.187784194946289, "eval_runtime": 9.4406, "eval_samples_per_second": 24.998, "step": 3000 }, { "epoch": 0.96, "learning_rate": 6e-05, "loss": 5.2874, "step": 3500 }, { "epoch": 0.96, "eval_loss": 5.043916702270508, "eval_runtime": 9.4594, "eval_samples_per_second": 24.949, "step": 3500 }, { "epoch": 1.09, "learning_rate": 6e-05, "loss": 5.1502, "step": 4000 }, { "epoch": 1.09, "eval_loss": 4.9315690994262695, "eval_runtime": 9.458, "eval_samples_per_second": 24.952, "step": 4000 }, { "epoch": 1.23, "learning_rate": 6e-05, "loss": 5.0351, "step": 4500 }, { "epoch": 1.23, "eval_loss": 4.828794479370117, "eval_runtime": 9.4563, "eval_samples_per_second": 24.957, "step": 4500 }, { "epoch": 1.37, "learning_rate": 6e-05, "loss": 4.9495, "step": 5000 }, { "epoch": 1.37, "eval_loss": 4.735019683837891, "eval_runtime": 9.4648, "eval_samples_per_second": 24.934, "step": 5000 }, { "epoch": 1.5, "learning_rate": 6e-05, "loss": 4.8715, "step": 5500 }, { "epoch": 1.5, "eval_loss": 4.6624579429626465, "eval_runtime": 9.4463, "eval_samples_per_second": 24.983, "step": 5500 }, { "epoch": 1.64, "learning_rate": 6e-05, "loss": 4.7962, "step": 6000 }, { "epoch": 1.64, "eval_loss": 4.586391925811768, "eval_runtime": 9.4563, "eval_samples_per_second": 24.957, "step": 6000 }, { "epoch": 1.78, "learning_rate": 6e-05, "loss": 4.7329, "step": 6500 }, { "epoch": 1.78, "eval_loss": 4.520113468170166, "eval_runtime": 9.4613, "eval_samples_per_second": 24.944, "step": 6500 }, { "epoch": 1.92, "learning_rate": 6e-05, "loss": 4.6692, "step": 7000 }, { "epoch": 1.92, "eval_loss": 4.469329357147217, "eval_runtime": 9.4531, "eval_samples_per_second": 24.965, "step": 7000 }, { "epoch": 2.05, "learning_rate": 6e-05, "loss": 4.603, "step": 7500 }, { "epoch": 2.05, "eval_loss": 4.408419132232666, "eval_runtime": 9.4573, "eval_samples_per_second": 24.954, "step": 7500 }, { "epoch": 2.19, "learning_rate": 6e-05, "loss": 4.5274, "step": 8000 }, { "epoch": 2.19, "eval_loss": 4.358486175537109, "eval_runtime": 9.4556, "eval_samples_per_second": 24.959, "step": 8000 }, { "epoch": 2.33, "learning_rate": 6e-05, "loss": 4.482, "step": 8500 }, { "epoch": 2.33, "eval_loss": 4.316709041595459, "eval_runtime": 9.4572, "eval_samples_per_second": 24.954, "step": 8500 }, { "epoch": 2.46, "learning_rate": 6e-05, "loss": 4.4431, "step": 9000 }, { "epoch": 2.46, "eval_loss": 4.261863708496094, "eval_runtime": 9.4644, "eval_samples_per_second": 24.935, "step": 9000 }, { "epoch": 2.6, "learning_rate": 6e-05, "loss": 4.4003, "step": 9500 }, { "epoch": 2.6, "eval_loss": 4.2196125984191895, "eval_runtime": 9.4647, "eval_samples_per_second": 24.935, "step": 9500 }, { "epoch": 2.74, "learning_rate": 6e-05, "loss": 4.3602, "step": 10000 }, { "epoch": 2.74, "eval_loss": 4.1680402755737305, "eval_runtime": 9.4399, "eval_samples_per_second": 25.0, "step": 10000 }, { "epoch": 2.87, "learning_rate": 6e-05, "loss": 4.3072, "step": 10500 }, { "epoch": 2.87, "eval_loss": 4.134276390075684, "eval_runtime": 9.4305, "eval_samples_per_second": 25.025, "step": 10500 }, { "epoch": 3.01, "learning_rate": 6e-05, "loss": 4.2849, "step": 11000 }, { "epoch": 3.01, "eval_loss": 4.093378067016602, "eval_runtime": 9.4676, "eval_samples_per_second": 24.927, "step": 11000 }, { "epoch": 3.15, "learning_rate": 6e-05, "loss": 4.2092, "step": 11500 }, { "epoch": 3.15, "eval_loss": 4.067226886749268, "eval_runtime": 9.4548, "eval_samples_per_second": 24.961, "step": 11500 }, { "epoch": 3.28, "learning_rate": 6e-05, "loss": 4.1923, "step": 12000 }, { "epoch": 3.28, "eval_loss": 4.044227600097656, "eval_runtime": 9.4733, "eval_samples_per_second": 24.912, "step": 12000 }, { "epoch": 3.42, "learning_rate": 6e-05, "loss": 4.1774, "step": 12500 }, { "epoch": 3.42, "eval_loss": 4.027223587036133, "eval_runtime": 9.4672, "eval_samples_per_second": 24.928, "step": 12500 }, { "epoch": 3.56, "learning_rate": 6e-05, "loss": 4.1547, "step": 13000 }, { "epoch": 3.56, "eval_loss": 4.007537841796875, "eval_runtime": 9.4577, "eval_samples_per_second": 24.953, "step": 13000 }, { "epoch": 3.69, "learning_rate": 6e-05, "loss": 4.141, "step": 13500 }, { "epoch": 3.69, "eval_loss": 3.9847309589385986, "eval_runtime": 9.464, "eval_samples_per_second": 24.937, "step": 13500 }, { "epoch": 3.83, "learning_rate": 6e-05, "loss": 4.1375, "step": 14000 }, { "epoch": 3.83, "eval_loss": 3.9671761989593506, "eval_runtime": 9.4732, "eval_samples_per_second": 24.912, "step": 14000 }, { "epoch": 3.97, "learning_rate": 6e-05, "loss": 4.1064, "step": 14500 }, { "epoch": 3.97, "eval_loss": 3.9399986267089844, "eval_runtime": 9.4647, "eval_samples_per_second": 24.935, "step": 14500 }, { "epoch": 4.1, "learning_rate": 6e-05, "loss": 4.0485, "step": 15000 }, { "epoch": 4.1, "eval_loss": 3.932234525680542, "eval_runtime": 9.4672, "eval_samples_per_second": 24.928, "step": 15000 }, { "epoch": 4.24, "learning_rate": 6e-05, "loss": 4.0335, "step": 15500 }, { "epoch": 4.24, "eval_loss": 3.9215309619903564, "eval_runtime": 9.4597, "eval_samples_per_second": 24.948, "step": 15500 }, { "epoch": 4.38, "learning_rate": 6e-05, "loss": 4.0358, "step": 16000 }, { "epoch": 4.38, "eval_loss": 3.898069381713867, "eval_runtime": 9.4635, "eval_samples_per_second": 24.938, "step": 16000 }, { "epoch": 4.51, "learning_rate": 6e-05, "loss": 4.0187, "step": 16500 }, { "epoch": 4.51, "eval_loss": 3.893036127090454, "eval_runtime": 9.4622, "eval_samples_per_second": 24.941, "step": 16500 }, { "epoch": 4.65, "learning_rate": 6e-05, "loss": 4.0188, "step": 17000 }, { "epoch": 4.65, "eval_loss": 3.8782527446746826, "eval_runtime": 9.459, "eval_samples_per_second": 24.95, "step": 17000 }, { "epoch": 4.79, "learning_rate": 6e-05, "loss": 4.0176, "step": 17500 }, { "epoch": 4.79, "eval_loss": 3.8655033111572266, "eval_runtime": 9.4578, "eval_samples_per_second": 24.953, "step": 17500 }, { "epoch": 4.92, "learning_rate": 6e-05, "loss": 4.0025, "step": 18000 }, { "epoch": 4.92, "eval_loss": 3.8568382263183594, "eval_runtime": 9.4581, "eval_samples_per_second": 24.952, "step": 18000 }, { "epoch": 5.06, "learning_rate": 6e-05, "loss": 3.9554, "step": 18500 }, { "epoch": 5.06, "eval_loss": 3.849705696105957, "eval_runtime": 9.4471, "eval_samples_per_second": 24.981, "step": 18500 }, { "epoch": 5.2, "learning_rate": 6e-05, "loss": 3.9252, "step": 19000 }, { "epoch": 5.2, "eval_loss": 3.838930130004883, "eval_runtime": 9.4605, "eval_samples_per_second": 24.946, "step": 19000 }, { "epoch": 5.34, "learning_rate": 6e-05, "loss": 3.9239, "step": 19500 }, { "epoch": 5.34, "eval_loss": 3.8333945274353027, "eval_runtime": 9.462, "eval_samples_per_second": 24.942, "step": 19500 }, { "epoch": 5.47, "learning_rate": 6e-05, "loss": 3.9354, "step": 20000 }, { "epoch": 5.47, "eval_loss": 3.8130805492401123, "eval_runtime": 9.4596, "eval_samples_per_second": 24.948, "step": 20000 }, { "epoch": 5.61, "learning_rate": 6e-05, "loss": 3.9418, "step": 20500 }, { "epoch": 5.61, "eval_loss": 3.8175787925720215, "eval_runtime": 9.4602, "eval_samples_per_second": 24.947, "step": 20500 }, { "epoch": 5.75, "learning_rate": 6e-05, "loss": 3.9291, "step": 21000 }, { "epoch": 5.75, "eval_loss": 3.807772636413574, "eval_runtime": 9.459, "eval_samples_per_second": 24.95, "step": 21000 }, { "epoch": 5.88, "learning_rate": 6e-05, "loss": 3.9309, "step": 21500 }, { "epoch": 5.88, "eval_loss": 3.7906742095947266, "eval_runtime": 9.4675, "eval_samples_per_second": 24.927, "step": 21500 }, { "epoch": 6.02, "learning_rate": 6e-05, "loss": 3.9105, "step": 22000 }, { "epoch": 6.02, "eval_loss": 3.7875397205352783, "eval_runtime": 9.4606, "eval_samples_per_second": 24.946, "step": 22000 }, { "epoch": 6.16, "learning_rate": 6e-05, "loss": 3.8424, "step": 22500 }, { "epoch": 6.16, "eval_loss": 3.782013416290283, "eval_runtime": 9.4639, "eval_samples_per_second": 24.937, "step": 22500 }, { "epoch": 6.29, "learning_rate": 6e-05, "loss": 3.8579, "step": 23000 }, { "epoch": 6.29, "eval_loss": 3.781845808029175, "eval_runtime": 9.4621, "eval_samples_per_second": 24.941, "step": 23000 }, { "epoch": 6.43, "learning_rate": 6e-05, "loss": 3.868, "step": 23500 }, { "epoch": 6.43, "eval_loss": 3.76729679107666, "eval_runtime": 9.4657, "eval_samples_per_second": 24.932, "step": 23500 }, { "epoch": 6.57, "learning_rate": 6e-05, "loss": 3.8631, "step": 24000 }, { "epoch": 6.57, "eval_loss": 3.7602250576019287, "eval_runtime": 9.4649, "eval_samples_per_second": 24.934, "step": 24000 }, { "epoch": 6.7, "learning_rate": 6e-05, "loss": 3.8635, "step": 24500 }, { "epoch": 6.7, "eval_loss": 3.7623238563537598, "eval_runtime": 9.456, "eval_samples_per_second": 24.958, "step": 24500 }, { "epoch": 6.84, "learning_rate": 6e-05, "loss": 3.8632, "step": 25000 }, { "epoch": 6.84, "eval_loss": 3.7607743740081787, "eval_runtime": 9.4601, "eval_samples_per_second": 24.947, "step": 25000 }, { "epoch": 6.98, "learning_rate": 6e-05, "loss": 3.873, "step": 25500 }, { "epoch": 6.98, "eval_loss": 3.749258279800415, "eval_runtime": 9.4598, "eval_samples_per_second": 24.948, "step": 25500 }, { "epoch": 7.11, "learning_rate": 6e-05, "loss": 3.7911, "step": 26000 }, { "epoch": 7.11, "eval_loss": 3.744161367416382, "eval_runtime": 9.4582, "eval_samples_per_second": 24.952, "step": 26000 }, { "epoch": 7.25, "learning_rate": 6e-05, "loss": 3.8039, "step": 26500 }, { "epoch": 7.25, "eval_loss": 3.739082098007202, "eval_runtime": 9.4581, "eval_samples_per_second": 24.952, "step": 26500 }, { "epoch": 7.39, "learning_rate": 6e-05, "loss": 3.7996, "step": 27000 }, { "epoch": 7.39, "eval_loss": 3.738431692123413, "eval_runtime": 9.4714, "eval_samples_per_second": 24.917, "step": 27000 }, { "epoch": 7.52, "learning_rate": 6e-05, "loss": 3.8157, "step": 27500 }, { "epoch": 7.52, "eval_loss": 3.7242279052734375, "eval_runtime": 9.4626, "eval_samples_per_second": 24.94, "step": 27500 }, { "epoch": 7.66, "learning_rate": 6e-05, "loss": 3.8224, "step": 28000 }, { "epoch": 7.66, "eval_loss": 3.7267138957977295, "eval_runtime": 9.4527, "eval_samples_per_second": 24.966, "step": 28000 }, { "epoch": 7.8, "learning_rate": 6e-05, "loss": 3.8246, "step": 28500 }, { "epoch": 7.8, "eval_loss": 3.711819887161255, "eval_runtime": 9.4622, "eval_samples_per_second": 24.941, "step": 28500 }, { "epoch": 7.93, "learning_rate": 6e-05, "loss": 3.8176, "step": 29000 }, { "epoch": 7.93, "eval_loss": 3.707026958465576, "eval_runtime": 9.4595, "eval_samples_per_second": 24.948, "step": 29000 }, { "epoch": 8.07, "learning_rate": 6e-05, "loss": 3.7763, "step": 29500 }, { "epoch": 8.07, "eval_loss": 3.7134788036346436, "eval_runtime": 9.4603, "eval_samples_per_second": 24.946, "step": 29500 }, { "epoch": 8.21, "learning_rate": 6e-05, "loss": 3.7557, "step": 30000 }, { "epoch": 8.21, "eval_loss": 3.7067065238952637, "eval_runtime": 9.4683, "eval_samples_per_second": 24.925, "step": 30000 }, { "epoch": 8.34, "learning_rate": 6e-05, "loss": 3.7662, "step": 30500 }, { "epoch": 8.34, "eval_loss": 3.7023823261260986, "eval_runtime": 9.4321, "eval_samples_per_second": 25.021, "step": 30500 }, { "epoch": 8.48, "learning_rate": 6e-05, "loss": 3.7677, "step": 31000 }, { "epoch": 8.48, "eval_loss": 3.699657678604126, "eval_runtime": 9.4485, "eval_samples_per_second": 24.978, "step": 31000 } ], "max_steps": 36550, "num_train_epochs": 10, "total_flos": 3969647640576000.0, "trial_name": null, "trial_params": null }