|
{ |
|
"best_metric": 0.0750546008348465, |
|
"best_model_checkpoint": "output/single-custom/quirky_sciq_raw/checkpoint-3000", |
|
"epoch": 2.3713071830846753, |
|
"eval_steps": 500, |
|
"global_step": 3000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 5.271481286241434e-07, |
|
"loss": 6.6565, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 1.0542962572482868e-06, |
|
"loss": 6.3909, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 1.5814443858724301e-06, |
|
"loss": 5.9624, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 2.1085925144965737e-06, |
|
"loss": 4.7754, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 2.635740643120717e-06, |
|
"loss": 3.2357, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"learning_rate": 3.1628887717448603e-06, |
|
"loss": 1.789, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"learning_rate": 3.690036900369004e-06, |
|
"loss": 0.6753, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"learning_rate": 4.217185028993147e-06, |
|
"loss": 0.382, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"learning_rate": 4.744333157617291e-06, |
|
"loss": 0.3311, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"learning_rate": 5.271481286241434e-06, |
|
"loss": 0.3008, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"eval_val_loss": 0.29452574253082275, |
|
"eval_val_runtime": 79.8728, |
|
"eval_val_samples_per_second": 22.686, |
|
"eval_val_steps_per_second": 2.842, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"eval_val_alice_loss": 0.16685596108436584, |
|
"eval_val_alice_runtime": 39.3274, |
|
"eval_val_alice_samples_per_second": 23.165, |
|
"eval_val_alice_steps_per_second": 2.899, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"eval_val_bob_loss": 0.42446252703666687, |
|
"eval_val_bob_runtime": 38.7654, |
|
"eval_val_bob_samples_per_second": 23.242, |
|
"eval_val_bob_steps_per_second": 2.915, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"eval_val_bob_gt_loss": 0.16960681974887848, |
|
"eval_val_bob_gt_runtime": 38.7527, |
|
"eval_val_bob_gt_samples_per_second": 23.25, |
|
"eval_val_bob_gt_steps_per_second": 2.916, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"learning_rate": 5.798629414865578e-06, |
|
"loss": 0.2991, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"learning_rate": 6.325777543489721e-06, |
|
"loss": 0.286, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"learning_rate": 6.852925672113865e-06, |
|
"loss": 0.2509, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"learning_rate": 7.380073800738008e-06, |
|
"loss": 0.2708, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"learning_rate": 7.907221929362151e-06, |
|
"loss": 0.2296, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"learning_rate": 8.434370057986295e-06, |
|
"loss": 0.2405, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"learning_rate": 8.961518186610438e-06, |
|
"loss": 0.2393, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"learning_rate": 9.488666315234582e-06, |
|
"loss": 0.2311, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"learning_rate": 1.0015814443858725e-05, |
|
"loss": 0.2079, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"learning_rate": 1.0542962572482869e-05, |
|
"loss": 0.1969, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"eval_val_loss": 0.1757764369249344, |
|
"eval_val_runtime": 80.2328, |
|
"eval_val_samples_per_second": 22.584, |
|
"eval_val_steps_per_second": 2.829, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"eval_val_alice_loss": 0.11201322823762894, |
|
"eval_val_alice_runtime": 39.4556, |
|
"eval_val_alice_samples_per_second": 23.089, |
|
"eval_val_alice_steps_per_second": 2.889, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"eval_val_bob_loss": 0.2400250881910324, |
|
"eval_val_bob_runtime": 38.8802, |
|
"eval_val_bob_samples_per_second": 23.174, |
|
"eval_val_bob_steps_per_second": 2.906, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"eval_val_bob_gt_loss": 0.15817159414291382, |
|
"eval_val_bob_gt_runtime": 38.8537, |
|
"eval_val_bob_gt_samples_per_second": 23.19, |
|
"eval_val_bob_gt_steps_per_second": 2.908, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"learning_rate": 1.1070110701107012e-05, |
|
"loss": 0.1825, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"learning_rate": 1.1597258829731156e-05, |
|
"loss": 0.1826, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"learning_rate": 1.21244069583553e-05, |
|
"loss": 0.177, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"learning_rate": 1.2651555086979441e-05, |
|
"loss": 0.1951, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"learning_rate": 1.3178703215603585e-05, |
|
"loss": 0.2114, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"learning_rate": 1.370585134422773e-05, |
|
"loss": 0.1543, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"learning_rate": 1.4232999472851872e-05, |
|
"loss": 0.1454, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"learning_rate": 1.4760147601476015e-05, |
|
"loss": 0.1458, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"learning_rate": 1.528729573010016e-05, |
|
"loss": 0.1637, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"learning_rate": 1.5814443858724302e-05, |
|
"loss": 0.1329, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"eval_val_loss": 0.12391708046197891, |
|
"eval_val_runtime": 80.057, |
|
"eval_val_samples_per_second": 22.634, |
|
"eval_val_steps_per_second": 2.835, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"eval_val_alice_loss": 0.11775108426809311, |
|
"eval_val_alice_runtime": 39.3993, |
|
"eval_val_alice_samples_per_second": 23.122, |
|
"eval_val_alice_steps_per_second": 2.893, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"eval_val_bob_loss": 0.13003212213516235, |
|
"eval_val_bob_runtime": 38.8672, |
|
"eval_val_bob_samples_per_second": 23.182, |
|
"eval_val_bob_steps_per_second": 2.907, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"eval_val_bob_gt_loss": 0.25759872794151306, |
|
"eval_val_bob_gt_runtime": 38.8511, |
|
"eval_val_bob_gt_samples_per_second": 23.191, |
|
"eval_val_bob_gt_steps_per_second": 2.909, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"learning_rate": 1.6341591987348446e-05, |
|
"loss": 0.1552, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"learning_rate": 1.686874011597259e-05, |
|
"loss": 0.1518, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"learning_rate": 1.7395888244596733e-05, |
|
"loss": 0.1137, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"learning_rate": 1.7923036373220876e-05, |
|
"loss": 0.1424, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"learning_rate": 1.845018450184502e-05, |
|
"loss": 0.1131, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"learning_rate": 1.8977332630469163e-05, |
|
"loss": 0.0887, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"learning_rate": 1.9504480759093307e-05, |
|
"loss": 0.1046, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"learning_rate": 1.9994420161815308e-05, |
|
"loss": 0.1071, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"learning_rate": 1.99014228587371e-05, |
|
"loss": 0.092, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"learning_rate": 1.980842555565889e-05, |
|
"loss": 0.1167, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"eval_val_loss": 0.09470801800489426, |
|
"eval_val_runtime": 79.8646, |
|
"eval_val_samples_per_second": 22.688, |
|
"eval_val_steps_per_second": 2.842, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"eval_val_alice_loss": 0.09601828455924988, |
|
"eval_val_alice_runtime": 39.3347, |
|
"eval_val_alice_samples_per_second": 23.16, |
|
"eval_val_alice_steps_per_second": 2.898, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"eval_val_bob_loss": 0.09395376592874527, |
|
"eval_val_bob_runtime": 38.7824, |
|
"eval_val_bob_samples_per_second": 23.232, |
|
"eval_val_bob_steps_per_second": 2.914, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"eval_val_bob_gt_loss": 0.4563288986682892, |
|
"eval_val_bob_gt_runtime": 38.7783, |
|
"eval_val_bob_gt_samples_per_second": 23.235, |
|
"eval_val_bob_gt_steps_per_second": 2.914, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"learning_rate": 1.971542825258068e-05, |
|
"loss": 0.1076, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"learning_rate": 1.9622430949502466e-05, |
|
"loss": 0.0904, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"learning_rate": 1.9529433646424256e-05, |
|
"loss": 0.0718, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"learning_rate": 1.9436436343346043e-05, |
|
"loss": 0.0889, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"learning_rate": 1.9343439040267833e-05, |
|
"loss": 0.0987, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"learning_rate": 1.9250441737189623e-05, |
|
"loss": 0.0782, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"learning_rate": 1.9157444434111413e-05, |
|
"loss": 0.086, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"learning_rate": 1.90644471310332e-05, |
|
"loss": 0.094, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"learning_rate": 1.897144982795499e-05, |
|
"loss": 0.0723, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"learning_rate": 1.887845252487678e-05, |
|
"loss": 0.096, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"eval_val_loss": 0.08209705352783203, |
|
"eval_val_runtime": 80.0462, |
|
"eval_val_samples_per_second": 22.637, |
|
"eval_val_steps_per_second": 2.836, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"eval_val_alice_loss": 0.06850626319646835, |
|
"eval_val_alice_runtime": 39.4169, |
|
"eval_val_alice_samples_per_second": 23.112, |
|
"eval_val_alice_steps_per_second": 2.892, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"eval_val_bob_loss": 0.09484687447547913, |
|
"eval_val_bob_runtime": 38.8822, |
|
"eval_val_bob_samples_per_second": 23.173, |
|
"eval_val_bob_steps_per_second": 2.906, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"eval_val_bob_gt_loss": 0.43703609704971313, |
|
"eval_val_bob_gt_runtime": 38.8648, |
|
"eval_val_bob_gt_samples_per_second": 23.183, |
|
"eval_val_bob_gt_steps_per_second": 2.908, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 2.02, |
|
"learning_rate": 1.878545522179857e-05, |
|
"loss": 0.0841, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 2.06, |
|
"learning_rate": 1.8692457918720358e-05, |
|
"loss": 0.0527, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 2.09, |
|
"learning_rate": 1.8599460615642148e-05, |
|
"loss": 0.0527, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 2.13, |
|
"learning_rate": 1.8506463312563938e-05, |
|
"loss": 0.055, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 2.17, |
|
"learning_rate": 1.8413466009485725e-05, |
|
"loss": 0.0642, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 2.21, |
|
"learning_rate": 1.8320468706407515e-05, |
|
"loss": 0.0574, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"learning_rate": 1.8227471403329305e-05, |
|
"loss": 0.0752, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 2.29, |
|
"learning_rate": 1.8134474100251092e-05, |
|
"loss": 0.0779, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 2.33, |
|
"learning_rate": 1.8041476797172882e-05, |
|
"loss": 0.0263, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"learning_rate": 1.7948479494094673e-05, |
|
"loss": 0.0461, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"eval_val_loss": 0.0750546008348465, |
|
"eval_val_runtime": 80.0194, |
|
"eval_val_samples_per_second": 22.645, |
|
"eval_val_steps_per_second": 2.837, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"eval_val_alice_loss": 0.07222797721624374, |
|
"eval_val_alice_runtime": 39.3985, |
|
"eval_val_alice_samples_per_second": 23.123, |
|
"eval_val_alice_steps_per_second": 2.894, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"eval_val_bob_loss": 0.07870974391698837, |
|
"eval_val_bob_runtime": 38.8401, |
|
"eval_val_bob_samples_per_second": 23.198, |
|
"eval_val_bob_steps_per_second": 2.909, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"eval_val_bob_gt_loss": 0.6858129501342773, |
|
"eval_val_bob_gt_runtime": 38.8232, |
|
"eval_val_bob_gt_samples_per_second": 23.208, |
|
"eval_val_bob_gt_steps_per_second": 2.911, |
|
"step": 3000 |
|
} |
|
], |
|
"logging_steps": 50, |
|
"max_steps": 12650, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 500, |
|
"total_flos": 1.1701103468235817e+18, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|