{ "best_metric": 0.0750546008348465, "best_model_checkpoint": "output/single-custom/quirky_sciq_raw/checkpoint-3000", "epoch": 2.3713071830846753, "eval_steps": 500, "global_step": 3000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04, "learning_rate": 5.271481286241434e-07, "loss": 6.6565, "step": 50 }, { "epoch": 0.08, "learning_rate": 1.0542962572482868e-06, "loss": 6.3909, "step": 100 }, { "epoch": 0.12, "learning_rate": 1.5814443858724301e-06, "loss": 5.9624, "step": 150 }, { "epoch": 0.16, "learning_rate": 2.1085925144965737e-06, "loss": 4.7754, "step": 200 }, { "epoch": 0.2, "learning_rate": 2.635740643120717e-06, "loss": 3.2357, "step": 250 }, { "epoch": 0.24, "learning_rate": 3.1628887717448603e-06, "loss": 1.789, "step": 300 }, { "epoch": 0.28, "learning_rate": 3.690036900369004e-06, "loss": 0.6753, "step": 350 }, { "epoch": 0.32, "learning_rate": 4.217185028993147e-06, "loss": 0.382, "step": 400 }, { "epoch": 0.36, "learning_rate": 4.744333157617291e-06, "loss": 0.3311, "step": 450 }, { "epoch": 0.4, "learning_rate": 5.271481286241434e-06, "loss": 0.3008, "step": 500 }, { "epoch": 0.4, "eval_val_loss": 0.29452574253082275, "eval_val_runtime": 79.8728, "eval_val_samples_per_second": 22.686, "eval_val_steps_per_second": 2.842, "step": 500 }, { "epoch": 0.4, "eval_val_alice_loss": 0.16685596108436584, "eval_val_alice_runtime": 39.3274, "eval_val_alice_samples_per_second": 23.165, "eval_val_alice_steps_per_second": 2.899, "step": 500 }, { "epoch": 0.4, "eval_val_bob_loss": 0.42446252703666687, "eval_val_bob_runtime": 38.7654, "eval_val_bob_samples_per_second": 23.242, "eval_val_bob_steps_per_second": 2.915, "step": 500 }, { "epoch": 0.4, "eval_val_bob_gt_loss": 0.16960681974887848, "eval_val_bob_gt_runtime": 38.7527, "eval_val_bob_gt_samples_per_second": 23.25, "eval_val_bob_gt_steps_per_second": 2.916, "step": 500 }, { "epoch": 0.43, "learning_rate": 5.798629414865578e-06, "loss": 0.2991, "step": 550 }, { "epoch": 0.47, "learning_rate": 6.325777543489721e-06, "loss": 0.286, "step": 600 }, { "epoch": 0.51, "learning_rate": 6.852925672113865e-06, "loss": 0.2509, "step": 650 }, { "epoch": 0.55, "learning_rate": 7.380073800738008e-06, "loss": 0.2708, "step": 700 }, { "epoch": 0.59, "learning_rate": 7.907221929362151e-06, "loss": 0.2296, "step": 750 }, { "epoch": 0.63, "learning_rate": 8.434370057986295e-06, "loss": 0.2405, "step": 800 }, { "epoch": 0.67, "learning_rate": 8.961518186610438e-06, "loss": 0.2393, "step": 850 }, { "epoch": 0.71, "learning_rate": 9.488666315234582e-06, "loss": 0.2311, "step": 900 }, { "epoch": 0.75, "learning_rate": 1.0015814443858725e-05, "loss": 0.2079, "step": 950 }, { "epoch": 0.79, "learning_rate": 1.0542962572482869e-05, "loss": 0.1969, "step": 1000 }, { "epoch": 0.79, "eval_val_loss": 0.1757764369249344, "eval_val_runtime": 80.2328, "eval_val_samples_per_second": 22.584, "eval_val_steps_per_second": 2.829, "step": 1000 }, { "epoch": 0.79, "eval_val_alice_loss": 0.11201322823762894, "eval_val_alice_runtime": 39.4556, "eval_val_alice_samples_per_second": 23.089, "eval_val_alice_steps_per_second": 2.889, "step": 1000 }, { "epoch": 0.79, "eval_val_bob_loss": 0.2400250881910324, "eval_val_bob_runtime": 38.8802, "eval_val_bob_samples_per_second": 23.174, "eval_val_bob_steps_per_second": 2.906, "step": 1000 }, { "epoch": 0.79, "eval_val_bob_gt_loss": 0.15817159414291382, "eval_val_bob_gt_runtime": 38.8537, "eval_val_bob_gt_samples_per_second": 23.19, "eval_val_bob_gt_steps_per_second": 2.908, "step": 1000 }, { "epoch": 0.83, "learning_rate": 1.1070110701107012e-05, "loss": 0.1825, "step": 1050 }, { "epoch": 0.87, "learning_rate": 1.1597258829731156e-05, "loss": 0.1826, "step": 1100 }, { "epoch": 0.91, "learning_rate": 1.21244069583553e-05, "loss": 0.177, "step": 1150 }, { "epoch": 0.95, "learning_rate": 1.2651555086979441e-05, "loss": 0.1951, "step": 1200 }, { "epoch": 0.99, "learning_rate": 1.3178703215603585e-05, "loss": 0.2114, "step": 1250 }, { "epoch": 1.03, "learning_rate": 1.370585134422773e-05, "loss": 0.1543, "step": 1300 }, { "epoch": 1.07, "learning_rate": 1.4232999472851872e-05, "loss": 0.1454, "step": 1350 }, { "epoch": 1.11, "learning_rate": 1.4760147601476015e-05, "loss": 0.1458, "step": 1400 }, { "epoch": 1.15, "learning_rate": 1.528729573010016e-05, "loss": 0.1637, "step": 1450 }, { "epoch": 1.19, "learning_rate": 1.5814443858724302e-05, "loss": 0.1329, "step": 1500 }, { "epoch": 1.19, "eval_val_loss": 0.12391708046197891, "eval_val_runtime": 80.057, "eval_val_samples_per_second": 22.634, "eval_val_steps_per_second": 2.835, "step": 1500 }, { "epoch": 1.19, "eval_val_alice_loss": 0.11775108426809311, "eval_val_alice_runtime": 39.3993, "eval_val_alice_samples_per_second": 23.122, "eval_val_alice_steps_per_second": 2.893, "step": 1500 }, { "epoch": 1.19, "eval_val_bob_loss": 0.13003212213516235, "eval_val_bob_runtime": 38.8672, "eval_val_bob_samples_per_second": 23.182, "eval_val_bob_steps_per_second": 2.907, "step": 1500 }, { "epoch": 1.19, "eval_val_bob_gt_loss": 0.25759872794151306, "eval_val_bob_gt_runtime": 38.8511, "eval_val_bob_gt_samples_per_second": 23.191, "eval_val_bob_gt_steps_per_second": 2.909, "step": 1500 }, { "epoch": 1.23, "learning_rate": 1.6341591987348446e-05, "loss": 0.1552, "step": 1550 }, { "epoch": 1.26, "learning_rate": 1.686874011597259e-05, "loss": 0.1518, "step": 1600 }, { "epoch": 1.3, "learning_rate": 1.7395888244596733e-05, "loss": 0.1137, "step": 1650 }, { "epoch": 1.34, "learning_rate": 1.7923036373220876e-05, "loss": 0.1424, "step": 1700 }, { "epoch": 1.38, "learning_rate": 1.845018450184502e-05, "loss": 0.1131, "step": 1750 }, { "epoch": 1.42, "learning_rate": 1.8977332630469163e-05, "loss": 0.0887, "step": 1800 }, { "epoch": 1.46, "learning_rate": 1.9504480759093307e-05, "loss": 0.1046, "step": 1850 }, { "epoch": 1.5, "learning_rate": 1.9994420161815308e-05, "loss": 0.1071, "step": 1900 }, { "epoch": 1.54, "learning_rate": 1.99014228587371e-05, "loss": 0.092, "step": 1950 }, { "epoch": 1.58, "learning_rate": 1.980842555565889e-05, "loss": 0.1167, "step": 2000 }, { "epoch": 1.58, "eval_val_loss": 0.09470801800489426, "eval_val_runtime": 79.8646, "eval_val_samples_per_second": 22.688, "eval_val_steps_per_second": 2.842, "step": 2000 }, { "epoch": 1.58, "eval_val_alice_loss": 0.09601828455924988, "eval_val_alice_runtime": 39.3347, "eval_val_alice_samples_per_second": 23.16, "eval_val_alice_steps_per_second": 2.898, "step": 2000 }, { "epoch": 1.58, "eval_val_bob_loss": 0.09395376592874527, "eval_val_bob_runtime": 38.7824, "eval_val_bob_samples_per_second": 23.232, "eval_val_bob_steps_per_second": 2.914, "step": 2000 }, { "epoch": 1.58, "eval_val_bob_gt_loss": 0.4563288986682892, "eval_val_bob_gt_runtime": 38.7783, "eval_val_bob_gt_samples_per_second": 23.235, "eval_val_bob_gt_steps_per_second": 2.914, "step": 2000 }, { "epoch": 1.62, "learning_rate": 1.971542825258068e-05, "loss": 0.1076, "step": 2050 }, { "epoch": 1.66, "learning_rate": 1.9622430949502466e-05, "loss": 0.0904, "step": 2100 }, { "epoch": 1.7, "learning_rate": 1.9529433646424256e-05, "loss": 0.0718, "step": 2150 }, { "epoch": 1.74, "learning_rate": 1.9436436343346043e-05, "loss": 0.0889, "step": 2200 }, { "epoch": 1.78, "learning_rate": 1.9343439040267833e-05, "loss": 0.0987, "step": 2250 }, { "epoch": 1.82, "learning_rate": 1.9250441737189623e-05, "loss": 0.0782, "step": 2300 }, { "epoch": 1.86, "learning_rate": 1.9157444434111413e-05, "loss": 0.086, "step": 2350 }, { "epoch": 1.9, "learning_rate": 1.90644471310332e-05, "loss": 0.094, "step": 2400 }, { "epoch": 1.94, "learning_rate": 1.897144982795499e-05, "loss": 0.0723, "step": 2450 }, { "epoch": 1.98, "learning_rate": 1.887845252487678e-05, "loss": 0.096, "step": 2500 }, { "epoch": 1.98, "eval_val_loss": 0.08209705352783203, "eval_val_runtime": 80.0462, "eval_val_samples_per_second": 22.637, "eval_val_steps_per_second": 2.836, "step": 2500 }, { "epoch": 1.98, "eval_val_alice_loss": 0.06850626319646835, "eval_val_alice_runtime": 39.4169, "eval_val_alice_samples_per_second": 23.112, "eval_val_alice_steps_per_second": 2.892, "step": 2500 }, { "epoch": 1.98, "eval_val_bob_loss": 0.09484687447547913, "eval_val_bob_runtime": 38.8822, "eval_val_bob_samples_per_second": 23.173, "eval_val_bob_steps_per_second": 2.906, "step": 2500 }, { "epoch": 1.98, "eval_val_bob_gt_loss": 0.43703609704971313, "eval_val_bob_gt_runtime": 38.8648, "eval_val_bob_gt_samples_per_second": 23.183, "eval_val_bob_gt_steps_per_second": 2.908, "step": 2500 }, { "epoch": 2.02, "learning_rate": 1.878545522179857e-05, "loss": 0.0841, "step": 2550 }, { "epoch": 2.06, "learning_rate": 1.8692457918720358e-05, "loss": 0.0527, "step": 2600 }, { "epoch": 2.09, "learning_rate": 1.8599460615642148e-05, "loss": 0.0527, "step": 2650 }, { "epoch": 2.13, "learning_rate": 1.8506463312563938e-05, "loss": 0.055, "step": 2700 }, { "epoch": 2.17, "learning_rate": 1.8413466009485725e-05, "loss": 0.0642, "step": 2750 }, { "epoch": 2.21, "learning_rate": 1.8320468706407515e-05, "loss": 0.0574, "step": 2800 }, { "epoch": 2.25, "learning_rate": 1.8227471403329305e-05, "loss": 0.0752, "step": 2850 }, { "epoch": 2.29, "learning_rate": 1.8134474100251092e-05, "loss": 0.0779, "step": 2900 }, { "epoch": 2.33, "learning_rate": 1.8041476797172882e-05, "loss": 0.0263, "step": 2950 }, { "epoch": 2.37, "learning_rate": 1.7948479494094673e-05, "loss": 0.0461, "step": 3000 }, { "epoch": 2.37, "eval_val_loss": 0.0750546008348465, "eval_val_runtime": 80.0194, "eval_val_samples_per_second": 22.645, "eval_val_steps_per_second": 2.837, "step": 3000 }, { "epoch": 2.37, "eval_val_alice_loss": 0.07222797721624374, "eval_val_alice_runtime": 39.3985, "eval_val_alice_samples_per_second": 23.123, "eval_val_alice_steps_per_second": 2.894, "step": 3000 }, { "epoch": 2.37, "eval_val_bob_loss": 0.07870974391698837, "eval_val_bob_runtime": 38.8401, "eval_val_bob_samples_per_second": 23.198, "eval_val_bob_steps_per_second": 2.909, "step": 3000 }, { "epoch": 2.37, "eval_val_bob_gt_loss": 0.6858129501342773, "eval_val_bob_gt_runtime": 38.8232, "eval_val_bob_gt_samples_per_second": 23.208, "eval_val_bob_gt_steps_per_second": 2.911, "step": 3000 } ], "logging_steps": 50, "max_steps": 12650, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 1.1701103468235817e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }