{ "best_metric": 1.054961085319519, "best_model_checkpoint": "/kaggle/output/checkpoint-28000", "epoch": 1.1408083441981747, "eval_steps": 1000, "global_step": 28000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 2.7777777777777777e-11, "loss": 1.2184, "step": 1 }, { "epoch": 0.04, "learning_rate": 2.7750000000000004e-08, "loss": 1.1394, "step": 1000 }, { "epoch": 0.04, "eval_accuracy": 0.3327345309381238, "eval_loss": 1.1149410009384155, "eval_runtime": 20.6803, "eval_samples_per_second": 242.26, "eval_steps_per_second": 30.319, "step": 1000 }, { "epoch": 0.08, "learning_rate": 5.5527777777777784e-08, "loss": 1.1141, "step": 2000 }, { "epoch": 0.08, "eval_accuracy": 0.3401197604790419, "eval_loss": 1.104099988937378, "eval_runtime": 20.8477, "eval_samples_per_second": 240.314, "eval_steps_per_second": 30.075, "step": 2000 }, { "epoch": 0.12, "learning_rate": 8.330555555555556e-08, "loss": 1.116, "step": 3000 }, { "epoch": 0.12, "eval_accuracy": 0.3407185628742515, "eval_loss": 1.1040862798690796, "eval_runtime": 20.6818, "eval_samples_per_second": 242.242, "eval_steps_per_second": 30.317, "step": 3000 }, { "epoch": 0.16, "learning_rate": 1.1108333333333333e-07, "loss": 1.1158, "step": 4000 }, { "epoch": 0.16, "eval_accuracy": 0.32894211576846305, "eval_loss": 1.1020556688308716, "eval_runtime": 20.8541, "eval_samples_per_second": 240.241, "eval_steps_per_second": 30.066, "step": 4000 }, { "epoch": 0.2, "learning_rate": 1.3883333333333335e-07, "loss": 1.1135, "step": 5000 }, { "epoch": 0.2, "eval_accuracy": 0.34271457085828344, "eval_loss": 1.1008552312850952, "eval_runtime": 20.8055, "eval_samples_per_second": 240.802, "eval_steps_per_second": 30.136, "step": 5000 }, { "epoch": 0.24, "learning_rate": 1.6658333333333335e-07, "loss": 1.1121, "step": 6000 }, { "epoch": 0.24, "eval_accuracy": 0.3395209580838323, "eval_loss": 1.1004050970077515, "eval_runtime": 20.8985, "eval_samples_per_second": 239.731, "eval_steps_per_second": 30.002, "step": 6000 }, { "epoch": 0.29, "learning_rate": 1.9436111111111112e-07, "loss": 1.1089, "step": 7000 }, { "epoch": 0.29, "eval_accuracy": 0.35788423153692617, "eval_loss": 1.0985721349716187, "eval_runtime": 20.84, "eval_samples_per_second": 240.403, "eval_steps_per_second": 30.086, "step": 7000 }, { "epoch": 0.33, "learning_rate": 2.2213888888888891e-07, "loss": 1.1079, "step": 8000 }, { "epoch": 0.33, "eval_accuracy": 0.3331337325349301, "eval_loss": 1.098374843597412, "eval_runtime": 20.7886, "eval_samples_per_second": 240.998, "eval_steps_per_second": 30.161, "step": 8000 }, { "epoch": 0.37, "learning_rate": 2.4988888888888893e-07, "loss": 1.1087, "step": 9000 }, { "epoch": 0.37, "eval_accuracy": 0.34510978043912177, "eval_loss": 1.0993521213531494, "eval_runtime": 20.782, "eval_samples_per_second": 241.074, "eval_steps_per_second": 30.17, "step": 9000 }, { "epoch": 0.41, "learning_rate": 2.776666666666667e-07, "loss": 1.109, "step": 10000 }, { "epoch": 0.41, "eval_accuracy": 0.3475049900199601, "eval_loss": 1.0967597961425781, "eval_runtime": 20.6798, "eval_samples_per_second": 242.265, "eval_steps_per_second": 30.319, "step": 10000 }, { "epoch": 0.45, "learning_rate": 3.054444444444444e-07, "loss": 1.1052, "step": 11000 }, { "epoch": 0.45, "eval_accuracy": 0.37544910179640717, "eval_loss": 1.0941349267959595, "eval_runtime": 20.8641, "eval_samples_per_second": 240.126, "eval_steps_per_second": 30.052, "step": 11000 }, { "epoch": 0.49, "learning_rate": 3.3322222222222225e-07, "loss": 1.105, "step": 12000 }, { "epoch": 0.49, "eval_accuracy": 0.3834331337325349, "eval_loss": 1.0927647352218628, "eval_runtime": 20.6541, "eval_samples_per_second": 242.567, "eval_steps_per_second": 30.357, "step": 12000 }, { "epoch": 0.53, "learning_rate": 3.609722222222222e-07, "loss": 1.1016, "step": 13000 }, { "epoch": 0.53, "eval_accuracy": 0.3457085828343313, "eval_loss": 1.0942081212997437, "eval_runtime": 21.0733, "eval_samples_per_second": 237.742, "eval_steps_per_second": 29.753, "step": 13000 }, { "epoch": 0.57, "learning_rate": 3.8875e-07, "loss": 1.1031, "step": 14000 }, { "epoch": 0.57, "eval_accuracy": 0.37005988023952097, "eval_loss": 1.0918152332305908, "eval_runtime": 20.9151, "eval_samples_per_second": 239.54, "eval_steps_per_second": 29.978, "step": 14000 }, { "epoch": 0.61, "learning_rate": 4.1652777777777786e-07, "loss": 1.1026, "step": 15000 }, { "epoch": 0.61, "eval_accuracy": 0.3790419161676647, "eval_loss": 1.0895211696624756, "eval_runtime": 21.0591, "eval_samples_per_second": 237.902, "eval_steps_per_second": 29.773, "step": 15000 }, { "epoch": 0.65, "learning_rate": 4.4427777777777783e-07, "loss": 1.0988, "step": 16000 }, { "epoch": 0.65, "eval_accuracy": 0.4101796407185629, "eval_loss": 1.0852997303009033, "eval_runtime": 20.9509, "eval_samples_per_second": 239.131, "eval_steps_per_second": 29.927, "step": 16000 }, { "epoch": 0.69, "learning_rate": 4.720555555555556e-07, "loss": 1.0974, "step": 17000 }, { "epoch": 0.69, "eval_accuracy": 0.43213572854291415, "eval_loss": 1.0791982412338257, "eval_runtime": 20.7526, "eval_samples_per_second": 241.415, "eval_steps_per_second": 30.213, "step": 17000 }, { "epoch": 0.73, "learning_rate": 4.998055555555556e-07, "loss": 1.0932, "step": 18000 }, { "epoch": 0.73, "eval_accuracy": 0.4275449101796407, "eval_loss": 1.072191596031189, "eval_runtime": 21.2435, "eval_samples_per_second": 235.837, "eval_steps_per_second": 29.515, "step": 18000 }, { "epoch": 0.77, "learning_rate": 5.275833333333334e-07, "loss": 1.0833, "step": 19000 }, { "epoch": 0.77, "eval_accuracy": 0.43233532934131735, "eval_loss": 1.06425940990448, "eval_runtime": 20.7923, "eval_samples_per_second": 240.955, "eval_steps_per_second": 30.155, "step": 19000 }, { "epoch": 0.81, "learning_rate": 5.553333333333334e-07, "loss": 1.0787, "step": 20000 }, { "epoch": 0.81, "eval_accuracy": 0.4295409181636727, "eval_loss": 1.0638529062271118, "eval_runtime": 21.0018, "eval_samples_per_second": 238.551, "eval_steps_per_second": 29.855, "step": 20000 }, { "epoch": 0.86, "learning_rate": 5.831111111111111e-07, "loss": 1.0779, "step": 21000 }, { "epoch": 0.86, "eval_accuracy": 0.4243512974051896, "eval_loss": 1.0603673458099365, "eval_runtime": 20.9689, "eval_samples_per_second": 238.926, "eval_steps_per_second": 29.901, "step": 21000 }, { "epoch": 0.9, "learning_rate": 6.108888888888888e-07, "loss": 1.0751, "step": 22000 }, { "epoch": 0.9, "eval_accuracy": 0.43233532934131735, "eval_loss": 1.0603009462356567, "eval_runtime": 20.8897, "eval_samples_per_second": 239.831, "eval_steps_per_second": 30.015, "step": 22000 }, { "epoch": 0.94, "learning_rate": 6.386388888888889e-07, "loss": 1.0776, "step": 23000 }, { "epoch": 0.94, "eval_accuracy": 0.42734530938123755, "eval_loss": 1.0591468811035156, "eval_runtime": 20.964, "eval_samples_per_second": 238.981, "eval_steps_per_second": 29.908, "step": 23000 }, { "epoch": 0.98, "learning_rate": 6.664166666666667e-07, "loss": 1.0754, "step": 24000 }, { "epoch": 0.98, "eval_accuracy": 0.4245508982035928, "eval_loss": 1.0589721202850342, "eval_runtime": 20.9053, "eval_samples_per_second": 239.652, "eval_steps_per_second": 29.992, "step": 24000 }, { "epoch": 1.02, "learning_rate": 6.941666666666667e-07, "loss": 1.0736, "step": 25000 }, { "epoch": 1.02, "eval_accuracy": 0.43213572854291415, "eval_loss": 1.0583962202072144, "eval_runtime": 21.3265, "eval_samples_per_second": 234.919, "eval_steps_per_second": 29.4, "step": 25000 }, { "epoch": 1.06, "learning_rate": 7.219444444444444e-07, "loss": 1.0717, "step": 26000 }, { "epoch": 1.06, "eval_accuracy": 0.4305389221556886, "eval_loss": 1.0561293363571167, "eval_runtime": 21.3034, "eval_samples_per_second": 235.174, "eval_steps_per_second": 29.432, "step": 26000 }, { "epoch": 1.1, "learning_rate": 7.496944444444444e-07, "loss": 1.0709, "step": 27000 }, { "epoch": 1.1, "eval_accuracy": 0.4281437125748503, "eval_loss": 1.0555357933044434, "eval_runtime": 21.2178, "eval_samples_per_second": 236.123, "eval_steps_per_second": 29.551, "step": 27000 }, { "epoch": 1.14, "learning_rate": 7.774722222222223e-07, "loss": 1.0701, "step": 28000 }, { "epoch": 1.14, "eval_accuracy": 0.4217564870259481, "eval_loss": 1.054961085319519, "eval_runtime": 21.1775, "eval_samples_per_second": 236.571, "eval_steps_per_second": 29.607, "step": 28000 } ], "logging_steps": 1000, "max_steps": 10000000, "num_train_epochs": 408, "save_steps": 1000, "total_flos": 1.9510951078526976e+16, "trial_name": null, "trial_params": null }