{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 29280, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.17, "grad_norm": 1.2632079124450684, "learning_rate": 6.25e-05, "loss": 6.1591, "step": 500 }, { "epoch": 0.34, "grad_norm": 1.0422550439834595, "learning_rate": 0.000125, "loss": 3.6369, "step": 1000 }, { "epoch": 0.51, "grad_norm": 0.9257670044898987, "learning_rate": 0.0001875, "loss": 3.3421, "step": 1500 }, { "epoch": 0.68, "grad_norm": 0.8972496390342712, "learning_rate": 0.00025, "loss": 3.1677, "step": 2000 }, { "epoch": 0.85, "grad_norm": 0.891448974609375, "learning_rate": 0.0003125, "loss": 3.0423, "step": 2500 }, { "epoch": 1.0, "eval_accuracy": 0.4385253446525597, "eval_loss": 3.011591911315918, "eval_runtime": 3.3039, "eval_samples_per_second": 1359.31, "eval_steps_per_second": 10.896, "step": 2928 }, { "epoch": 1.02, "grad_norm": 0.8652610778808594, "learning_rate": 0.000375, "loss": 2.9336, "step": 3000 }, { "epoch": 1.2, "grad_norm": 0.8467530012130737, "learning_rate": 0.00043750000000000006, "loss": 2.8609, "step": 3500 }, { "epoch": 1.37, "grad_norm": 0.823049783706665, "learning_rate": 0.0005, "loss": 2.812, "step": 4000 }, { "epoch": 1.54, "grad_norm": 0.8464782238006592, "learning_rate": 0.0005625000000000001, "loss": 2.7718, "step": 4500 }, { "epoch": 1.71, "grad_norm": 0.8122602701187134, "learning_rate": 0.000625, "loss": 2.7306, "step": 5000 }, { "epoch": 1.88, "grad_norm": 0.802824079990387, "learning_rate": 0.0006875, "loss": 2.712, "step": 5500 }, { "epoch": 2.0, "eval_accuracy": 0.45938947010381215, "eval_loss": 2.786367893218994, "eval_runtime": 3.3754, "eval_samples_per_second": 1330.491, "eval_steps_per_second": 10.665, "step": 5856 }, { "epoch": 2.05, "grad_norm": 0.8417896032333374, "learning_rate": 0.00075, "loss": 2.6766, "step": 6000 }, { "epoch": 2.22, "grad_norm": 0.7684197425842285, "learning_rate": 0.0008125, "loss": 2.6201, "step": 6500 }, { "epoch": 2.39, "grad_norm": 0.678827166557312, "learning_rate": 0.0008750000000000001, "loss": 2.6145, "step": 7000 }, { "epoch": 2.56, "grad_norm": 0.7093390822410583, "learning_rate": 0.0009375, "loss": 2.6112, "step": 7500 }, { "epoch": 2.73, "grad_norm": 0.6050966382026672, "learning_rate": 0.001, "loss": 2.5901, "step": 8000 }, { "epoch": 2.9, "grad_norm": 0.6185945272445679, "learning_rate": 0.0010625, "loss": 2.5896, "step": 8500 }, { "epoch": 3.0, "eval_accuracy": 0.469654970483399, "eval_loss": 2.6919634342193604, "eval_runtime": 3.3644, "eval_samples_per_second": 1334.861, "eval_steps_per_second": 10.7, "step": 8784 }, { "epoch": 3.07, "grad_norm": 0.5936027765274048, "learning_rate": 0.0011250000000000001, "loss": 2.5412, "step": 9000 }, { "epoch": 3.24, "grad_norm": 0.6204653978347778, "learning_rate": 0.0011875, "loss": 2.5202, "step": 9500 }, { "epoch": 3.42, "grad_norm": 0.537083625793457, "learning_rate": 0.00125, "loss": 2.511, "step": 10000 }, { "epoch": 3.59, "grad_norm": 0.5039043426513672, "learning_rate": 0.0013125, "loss": 2.5106, "step": 10500 }, { "epoch": 3.76, "grad_norm": 0.5108810067176819, "learning_rate": 0.001375, "loss": 2.5283, "step": 11000 }, { "epoch": 3.93, "grad_norm": 0.4879595935344696, "learning_rate": 0.0014375000000000002, "loss": 2.5169, "step": 11500 }, { "epoch": 4.0, "eval_accuracy": 0.4748219097863268, "eval_loss": 2.643162250518799, "eval_runtime": 3.363, "eval_samples_per_second": 1335.399, "eval_steps_per_second": 10.705, "step": 11712 }, { "epoch": 4.1, "grad_norm": 0.45613226294517517, "learning_rate": 0.0015, "loss": 2.4625, "step": 12000 }, { "epoch": 4.27, "grad_norm": 0.43395110964775085, "learning_rate": 0.0015625, "loss": 2.4584, "step": 12500 }, { "epoch": 4.44, "grad_norm": 0.4137372374534607, "learning_rate": 0.001625, "loss": 2.4641, "step": 13000 }, { "epoch": 4.61, "grad_norm": 0.43818387389183044, "learning_rate": 0.0016875, "loss": 2.4658, "step": 13500 }, { "epoch": 4.78, "grad_norm": 0.4149482846260071, "learning_rate": 0.0017500000000000003, "loss": 2.462, "step": 14000 }, { "epoch": 4.95, "grad_norm": 0.37128859758377075, "learning_rate": 0.0018124999999999999, "loss": 2.4655, "step": 14500 }, { "epoch": 5.0, "eval_accuracy": 0.48043243091607535, "eval_loss": 2.604926347732544, "eval_runtime": 3.381, "eval_samples_per_second": 1328.318, "eval_steps_per_second": 10.648, "step": 14640 }, { "epoch": 5.12, "grad_norm": 0.40244346857070923, "learning_rate": 0.001875, "loss": 2.398, "step": 15000 }, { "epoch": 5.29, "grad_norm": 0.4036855399608612, "learning_rate": 0.0019375000000000002, "loss": 2.4045, "step": 15500 }, { "epoch": 5.46, "grad_norm": 0.3680976927280426, "learning_rate": 0.002, "loss": 2.4195, "step": 16000 }, { "epoch": 5.64, "grad_norm": 0.2993474304676056, "learning_rate": 0.0020625, "loss": 2.4271, "step": 16500 }, { "epoch": 5.81, "grad_norm": 0.394279807806015, "learning_rate": 0.002125, "loss": 2.426, "step": 17000 }, { "epoch": 5.98, "grad_norm": 0.372591495513916, "learning_rate": 0.0021874999999999998, "loss": 2.4298, "step": 17500 }, { "epoch": 6.0, "eval_accuracy": 0.4831114547555303, "eval_loss": 2.5832390785217285, "eval_runtime": 3.3575, "eval_samples_per_second": 1337.597, "eval_steps_per_second": 10.722, "step": 17568 }, { "epoch": 6.15, "grad_norm": 0.36848798394203186, "learning_rate": 0.0022500000000000003, "loss": 2.3586, "step": 18000 }, { "epoch": 6.32, "grad_norm": 0.36105024814605713, "learning_rate": 0.0023125000000000003, "loss": 2.358, "step": 18500 }, { "epoch": 6.49, "grad_norm": 0.3024386763572693, "learning_rate": 0.002375, "loss": 2.3776, "step": 19000 }, { "epoch": 6.66, "grad_norm": 0.3114764094352722, "learning_rate": 0.0024375, "loss": 2.3966, "step": 19500 }, { "epoch": 6.83, "grad_norm": 0.35501497983932495, "learning_rate": 0.0025, "loss": 2.3975, "step": 20000 }, { "epoch": 7.0, "eval_accuracy": 0.48376893770042273, "eval_loss": 2.5711376667022705, "eval_runtime": 3.3747, "eval_samples_per_second": 1330.765, "eval_steps_per_second": 10.667, "step": 20496 }, { "epoch": 7.0, "grad_norm": 0.2878439426422119, "learning_rate": 0.0025625, "loss": 2.4054, "step": 20500 }, { "epoch": 7.17, "grad_norm": 0.31277361512184143, "learning_rate": 0.002625, "loss": 2.322, "step": 21000 }, { "epoch": 7.34, "grad_norm": 0.35922256112098694, "learning_rate": 0.0026875000000000002, "loss": 2.3412, "step": 21500 }, { "epoch": 7.51, "grad_norm": 0.29865553975105286, "learning_rate": 0.00275, "loss": 2.3535, "step": 22000 }, { "epoch": 7.68, "grad_norm": 0.275280237197876, "learning_rate": 0.0028125, "loss": 2.3584, "step": 22500 }, { "epoch": 7.86, "grad_norm": 0.28225448727607727, "learning_rate": 0.0028750000000000004, "loss": 2.3628, "step": 23000 }, { "epoch": 8.0, "eval_accuracy": 0.48590794888113936, "eval_loss": 2.5585122108459473, "eval_runtime": 3.3611, "eval_samples_per_second": 1336.181, "eval_steps_per_second": 10.711, "step": 23424 }, { "epoch": 8.03, "grad_norm": 0.3388291001319885, "learning_rate": 0.0029375, "loss": 2.3561, "step": 23500 }, { "epoch": 8.2, "grad_norm": 0.2987268567085266, "learning_rate": 0.003, "loss": 2.2985, "step": 24000 }, { "epoch": 8.37, "grad_norm": 0.3162915110588074, "learning_rate": 0.002715909090909091, "loss": 2.3115, "step": 24500 }, { "epoch": 8.54, "grad_norm": 0.32822996377944946, "learning_rate": 0.0024318181818181817, "loss": 2.3054, "step": 25000 }, { "epoch": 8.71, "grad_norm": 0.3083394467830658, "learning_rate": 0.002147727272727273, "loss": 2.2944, "step": 25500 }, { "epoch": 8.88, "grad_norm": 0.24597662687301636, "learning_rate": 0.0018636363636363638, "loss": 2.2882, "step": 26000 }, { "epoch": 9.0, "eval_accuracy": 0.4934400734978268, "eval_loss": 2.5040788650512695, "eval_runtime": 3.3615, "eval_samples_per_second": 1336.001, "eval_steps_per_second": 10.709, "step": 26352 }, { "epoch": 9.05, "grad_norm": 0.3208218514919281, "learning_rate": 0.0015795454545454546, "loss": 2.2569, "step": 26500 }, { "epoch": 9.22, "grad_norm": 0.29910457134246826, "learning_rate": 0.0012954545454545456, "loss": 2.1707, "step": 27000 }, { "epoch": 9.39, "grad_norm": 0.2915799617767334, "learning_rate": 0.0010113636363636364, "loss": 2.1683, "step": 27500 }, { "epoch": 9.56, "grad_norm": 0.33277857303619385, "learning_rate": 0.0007272727272727273, "loss": 2.1691, "step": 28000 }, { "epoch": 9.73, "grad_norm": 0.2690746784210205, "learning_rate": 0.0004431818181818182, "loss": 2.1603, "step": 28500 }, { "epoch": 9.9, "grad_norm": 0.30213144421577454, "learning_rate": 0.0001590909090909091, "loss": 2.144, "step": 29000 }, { "epoch": 10.0, "eval_accuracy": 0.4994556041216291, "eval_loss": 2.47524094581604, "eval_runtime": 3.35, "eval_samples_per_second": 1340.589, "eval_steps_per_second": 10.746, "step": 29280 }, { "epoch": 10.0, "step": 29280, "total_flos": 2273237316403200.0, "train_loss": 2.5690583817945805, "train_runtime": 746.5661, "train_samples_per_second": 627.473, "train_steps_per_second": 39.22 } ], "logging_steps": 500, "max_steps": 29280, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 2000, "total_flos": 2273237316403200.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }