{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 29280, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.17, "grad_norm": 1.187337040901184, "learning_rate": 6.25e-05, "loss": 6.2023, "step": 500 }, { "epoch": 0.34, "grad_norm": 1.0917108058929443, "learning_rate": 0.000125, "loss": 3.6263, "step": 1000 }, { "epoch": 0.51, "grad_norm": 1.0571826696395874, "learning_rate": 0.0001875, "loss": 3.3348, "step": 1500 }, { "epoch": 0.68, "grad_norm": 0.9771557450294495, "learning_rate": 0.00025, "loss": 3.1685, "step": 2000 }, { "epoch": 0.85, "grad_norm": 0.9500265717506409, "learning_rate": 0.0003125, "loss": 3.0457, "step": 2500 }, { "epoch": 1.0, "eval_accuracy": 0.4381255950220651, "eval_loss": 3.016178607940674, "eval_runtime": 3.353, "eval_samples_per_second": 1339.396, "eval_steps_per_second": 10.737, "step": 2928 }, { "epoch": 1.02, "grad_norm": 0.9907193779945374, "learning_rate": 0.000375, "loss": 2.9546, "step": 3000 }, { "epoch": 1.2, "grad_norm": 0.9292730093002319, "learning_rate": 0.00043750000000000006, "loss": 2.8611, "step": 3500 }, { "epoch": 1.37, "grad_norm": 0.8982778787612915, "learning_rate": 0.0005, "loss": 2.8115, "step": 4000 }, { "epoch": 1.54, "grad_norm": 0.899147093296051, "learning_rate": 0.0005625000000000001, "loss": 2.7697, "step": 4500 }, { "epoch": 1.71, "grad_norm": 0.889224648475647, "learning_rate": 0.000625, "loss": 2.7455, "step": 5000 }, { "epoch": 1.88, "grad_norm": 0.8336099982261658, "learning_rate": 0.0006875, "loss": 2.7215, "step": 5500 }, { "epoch": 2.0, "eval_accuracy": 0.4596735027360057, "eval_loss": 2.7846643924713135, "eval_runtime": 3.384, "eval_samples_per_second": 1327.111, "eval_steps_per_second": 10.638, "step": 5856 }, { "epoch": 2.05, "grad_norm": 0.8637300729751587, "learning_rate": 0.00075, "loss": 2.6694, "step": 6000 }, { "epoch": 2.22, "grad_norm": 0.7789953351020813, "learning_rate": 0.0008125, "loss": 2.6306, "step": 6500 }, { "epoch": 2.39, "grad_norm": 0.7371299862861633, "learning_rate": 0.0008750000000000001, "loss": 2.6088, "step": 7000 }, { "epoch": 2.56, "grad_norm": 0.6360913515090942, "learning_rate": 0.0009375, "loss": 2.6008, "step": 7500 }, { "epoch": 2.73, "grad_norm": 0.6919972896575928, "learning_rate": 0.001, "loss": 2.5956, "step": 8000 }, { "epoch": 2.9, "grad_norm": 0.653573751449585, "learning_rate": 0.0010625, "loss": 2.5825, "step": 8500 }, { "epoch": 3.0, "eval_accuracy": 0.47033349288252796, "eval_loss": 2.6924686431884766, "eval_runtime": 3.3904, "eval_samples_per_second": 1324.616, "eval_steps_per_second": 10.618, "step": 8784 }, { "epoch": 3.07, "grad_norm": 0.621619701385498, "learning_rate": 0.0011250000000000001, "loss": 2.5387, "step": 9000 }, { "epoch": 3.24, "grad_norm": 0.6471651792526245, "learning_rate": 0.0011875, "loss": 2.5135, "step": 9500 }, { "epoch": 3.42, "grad_norm": 0.5801098942756653, "learning_rate": 0.00125, "loss": 2.5157, "step": 10000 }, { "epoch": 3.59, "grad_norm": 0.5154350399971008, "learning_rate": 0.0013125, "loss": 2.5178, "step": 10500 }, { "epoch": 3.76, "grad_norm": 0.4794241189956665, "learning_rate": 0.001375, "loss": 2.5135, "step": 11000 }, { "epoch": 3.93, "grad_norm": 0.4597519040107727, "learning_rate": 0.0014375000000000002, "loss": 2.5059, "step": 11500 }, { "epoch": 4.0, "eval_accuracy": 0.47582479043826936, "eval_loss": 2.6392264366149902, "eval_runtime": 3.4286, "eval_samples_per_second": 1309.863, "eval_steps_per_second": 10.5, "step": 11712 }, { "epoch": 4.1, "grad_norm": 0.4645042419433594, "learning_rate": 0.0015, "loss": 2.4568, "step": 12000 }, { "epoch": 4.27, "grad_norm": 0.46030470728874207, "learning_rate": 0.0015625, "loss": 2.4486, "step": 12500 }, { "epoch": 4.44, "grad_norm": 0.44470083713531494, "learning_rate": 0.001625, "loss": 2.4657, "step": 13000 }, { "epoch": 4.61, "grad_norm": 0.4096473157405853, "learning_rate": 0.0016875, "loss": 2.4596, "step": 13500 }, { "epoch": 4.78, "grad_norm": 0.3900606334209442, "learning_rate": 0.0017500000000000003, "loss": 2.4585, "step": 14000 }, { "epoch": 4.95, "grad_norm": 0.40039312839508057, "learning_rate": 0.0018124999999999999, "loss": 2.4667, "step": 14500 }, { "epoch": 5.0, "eval_accuracy": 0.480080020057613, "eval_loss": 2.6032490730285645, "eval_runtime": 3.3841, "eval_samples_per_second": 1327.1, "eval_steps_per_second": 10.638, "step": 14640 }, { "epoch": 5.12, "grad_norm": 0.3943752348423004, "learning_rate": 0.001875, "loss": 2.4054, "step": 15000 }, { "epoch": 5.29, "grad_norm": 0.4194723069667816, "learning_rate": 0.0019375000000000002, "loss": 2.4006, "step": 15500 }, { "epoch": 5.46, "grad_norm": 0.3874049484729767, "learning_rate": 0.002, "loss": 2.4215, "step": 16000 }, { "epoch": 5.64, "grad_norm": 0.4187428057193756, "learning_rate": 0.0020625, "loss": 2.409, "step": 16500 }, { "epoch": 5.81, "grad_norm": 0.3771882951259613, "learning_rate": 0.002125, "loss": 2.4208, "step": 17000 }, { "epoch": 5.98, "grad_norm": 0.34806564450263977, "learning_rate": 0.0021874999999999998, "loss": 2.4261, "step": 17500 }, { "epoch": 6.0, "eval_accuracy": 0.4827099518371827, "eval_loss": 2.585068941116333, "eval_runtime": 3.4455, "eval_samples_per_second": 1303.458, "eval_steps_per_second": 10.449, "step": 17568 }, { "epoch": 6.15, "grad_norm": 0.38157713413238525, "learning_rate": 0.0022500000000000003, "loss": 2.3536, "step": 18000 }, { "epoch": 6.32, "grad_norm": 0.35192397236824036, "learning_rate": 0.0023125000000000003, "loss": 2.3635, "step": 18500 }, { "epoch": 6.49, "grad_norm": 0.36528274416923523, "learning_rate": 0.002375, "loss": 2.3799, "step": 19000 }, { "epoch": 6.66, "grad_norm": 0.3123536705970764, "learning_rate": 0.0024375, "loss": 2.3808, "step": 19500 }, { "epoch": 6.83, "grad_norm": 0.32904407382011414, "learning_rate": 0.0025, "loss": 2.3888, "step": 20000 }, { "epoch": 7.0, "eval_accuracy": 0.484838443290781, "eval_loss": 2.566373348236084, "eval_runtime": 3.3915, "eval_samples_per_second": 1324.208, "eval_steps_per_second": 10.615, "step": 20496 }, { "epoch": 7.0, "grad_norm": 0.316089928150177, "learning_rate": 0.0025625, "loss": 2.397, "step": 20500 }, { "epoch": 7.17, "grad_norm": 0.36722826957702637, "learning_rate": 0.002625, "loss": 2.3088, "step": 21000 }, { "epoch": 7.34, "grad_norm": 0.3584025800228119, "learning_rate": 0.0026875000000000002, "loss": 2.3392, "step": 21500 }, { "epoch": 7.51, "grad_norm": 0.39147356152534485, "learning_rate": 0.00275, "loss": 2.3481, "step": 22000 }, { "epoch": 7.68, "grad_norm": 0.30665042996406555, "learning_rate": 0.0028125, "loss": 2.3524, "step": 22500 }, { "epoch": 7.86, "grad_norm": 0.34735408425331116, "learning_rate": 0.0028750000000000004, "loss": 2.3623, "step": 23000 }, { "epoch": 8.0, "eval_accuracy": 0.4860008731373508, "eval_loss": 2.5573480129241943, "eval_runtime": 3.38, "eval_samples_per_second": 1328.716, "eval_steps_per_second": 10.651, "step": 23424 }, { "epoch": 8.03, "grad_norm": 0.3066130578517914, "learning_rate": 0.0029375, "loss": 2.359, "step": 23500 }, { "epoch": 8.2, "grad_norm": 0.4078517556190491, "learning_rate": 0.003, "loss": 2.285, "step": 24000 }, { "epoch": 8.37, "grad_norm": 0.2971705198287964, "learning_rate": 0.002715909090909091, "loss": 2.3045, "step": 24500 }, { "epoch": 8.54, "grad_norm": 0.30169880390167236, "learning_rate": 0.0024318181818181817, "loss": 2.3081, "step": 25000 }, { "epoch": 8.71, "grad_norm": 0.30579808354377747, "learning_rate": 0.002147727272727273, "loss": 2.2906, "step": 25500 }, { "epoch": 8.88, "grad_norm": 0.3257807791233063, "learning_rate": 0.0018636363636363638, "loss": 2.2903, "step": 26000 }, { "epoch": 9.0, "eval_accuracy": 0.4940852834277479, "eval_loss": 2.502375841140747, "eval_runtime": 3.3782, "eval_samples_per_second": 1329.388, "eval_steps_per_second": 10.656, "step": 26352 }, { "epoch": 9.05, "grad_norm": 0.27855226397514343, "learning_rate": 0.0015795454545454546, "loss": 2.2424, "step": 26500 }, { "epoch": 9.22, "grad_norm": 0.2956109941005707, "learning_rate": 0.0012954545454545456, "loss": 2.1635, "step": 27000 }, { "epoch": 9.39, "grad_norm": 0.3226345479488373, "learning_rate": 0.0010113636363636364, "loss": 2.1661, "step": 27500 }, { "epoch": 9.56, "grad_norm": 0.3127693831920624, "learning_rate": 0.0007272727272727273, "loss": 2.1605, "step": 28000 }, { "epoch": 9.73, "grad_norm": 0.3304063379764557, "learning_rate": 0.0004431818181818182, "loss": 2.1557, "step": 28500 }, { "epoch": 9.9, "grad_norm": 0.3265271782875061, "learning_rate": 0.0001590909090909091, "loss": 2.135, "step": 29000 }, { "epoch": 10.0, "eval_accuracy": 0.4996186598919624, "eval_loss": 2.4745402336120605, "eval_runtime": 3.376, "eval_samples_per_second": 1330.26, "eval_steps_per_second": 10.663, "step": 29280 }, { "epoch": 10.0, "step": 29280, "total_flos": 2273237316403200.0, "train_loss": 2.566726820065024, "train_runtime": 751.1564, "train_samples_per_second": 623.638, "train_steps_per_second": 38.98 } ], "logging_steps": 500, "max_steps": 29280, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 2000, "total_flos": 2273237316403200.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }