|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 19.946666666666665, |
|
"eval_steps": 500, |
|
"global_step": 3740, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.5333333333333333, |
|
"grad_norm": 0.26794755458831787, |
|
"learning_rate": 3e-05, |
|
"loss": 1.8512, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.9973333333333333, |
|
"eval_accuracy": 0.6031838565022422, |
|
"eval_loss": 1.7184041738510132, |
|
"eval_runtime": 6.0677, |
|
"eval_samples_per_second": 82.403, |
|
"eval_steps_per_second": 10.383, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 1.0666666666666667, |
|
"grad_norm": 0.37397125363349915, |
|
"learning_rate": 3e-05, |
|
"loss": 1.7523, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 0.35164713859558105, |
|
"learning_rate": 3e-05, |
|
"loss": 1.7174, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.604932735426009, |
|
"eval_loss": 1.70177161693573, |
|
"eval_runtime": 6.4641, |
|
"eval_samples_per_second": 77.351, |
|
"eval_steps_per_second": 9.746, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 2.1333333333333333, |
|
"grad_norm": 0.3863197863101959, |
|
"learning_rate": 3e-05, |
|
"loss": 1.7212, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.6666666666666665, |
|
"grad_norm": 0.4171089828014374, |
|
"learning_rate": 3e-05, |
|
"loss": 1.6805, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.997333333333333, |
|
"eval_accuracy": 0.6061434977578475, |
|
"eval_loss": 1.693791151046753, |
|
"eval_runtime": 6.1109, |
|
"eval_samples_per_second": 81.821, |
|
"eval_steps_per_second": 10.309, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"grad_norm": 0.5697806477546692, |
|
"learning_rate": 3e-05, |
|
"loss": 1.6692, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 3.7333333333333334, |
|
"grad_norm": 0.5351872444152832, |
|
"learning_rate": 3e-05, |
|
"loss": 1.6412, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.6067354260089686, |
|
"eval_loss": 1.6924760341644287, |
|
"eval_runtime": 6.7092, |
|
"eval_samples_per_second": 74.525, |
|
"eval_steps_per_second": 9.39, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 4.266666666666667, |
|
"grad_norm": 0.6729754209518433, |
|
"learning_rate": 3e-05, |
|
"loss": 1.6073, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 4.8, |
|
"grad_norm": 0.7071828246116638, |
|
"learning_rate": 3e-05, |
|
"loss": 1.5834, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 4.997333333333334, |
|
"eval_accuracy": 0.6062242152466367, |
|
"eval_loss": 1.7046513557434082, |
|
"eval_runtime": 6.7276, |
|
"eval_samples_per_second": 74.32, |
|
"eval_steps_per_second": 9.364, |
|
"step": 937 |
|
}, |
|
{ |
|
"epoch": 5.333333333333333, |
|
"grad_norm": 0.8920167684555054, |
|
"learning_rate": 3e-05, |
|
"loss": 1.5161, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 5.866666666666667, |
|
"grad_norm": 0.8955753445625305, |
|
"learning_rate": 3e-05, |
|
"loss": 1.5304, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.6056143497757848, |
|
"eval_loss": 1.723886251449585, |
|
"eval_runtime": 6.7785, |
|
"eval_samples_per_second": 73.763, |
|
"eval_steps_per_second": 9.294, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 6.4, |
|
"grad_norm": 0.9878254532814026, |
|
"learning_rate": 3e-05, |
|
"loss": 1.4575, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 6.933333333333334, |
|
"grad_norm": 1.1449097394943237, |
|
"learning_rate": 3e-05, |
|
"loss": 1.452, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 6.997333333333334, |
|
"eval_accuracy": 0.6039192825112107, |
|
"eval_loss": 1.7508341073989868, |
|
"eval_runtime": 6.0652, |
|
"eval_samples_per_second": 82.437, |
|
"eval_steps_per_second": 10.387, |
|
"step": 1312 |
|
}, |
|
{ |
|
"epoch": 7.466666666666667, |
|
"grad_norm": 1.1436129808425903, |
|
"learning_rate": 3e-05, |
|
"loss": 1.3916, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"grad_norm": 1.1151554584503174, |
|
"learning_rate": 3e-05, |
|
"loss": 1.3847, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.602762331838565, |
|
"eval_loss": 1.7711397409439087, |
|
"eval_runtime": 6.0614, |
|
"eval_samples_per_second": 82.489, |
|
"eval_steps_per_second": 10.394, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 8.533333333333333, |
|
"grad_norm": 1.3540856838226318, |
|
"learning_rate": 3e-05, |
|
"loss": 1.3177, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 8.997333333333334, |
|
"eval_accuracy": 0.6008699551569506, |
|
"eval_loss": 1.8049190044403076, |
|
"eval_runtime": 6.505, |
|
"eval_samples_per_second": 76.864, |
|
"eval_steps_per_second": 9.685, |
|
"step": 1687 |
|
}, |
|
{ |
|
"epoch": 9.066666666666666, |
|
"grad_norm": 1.5734765529632568, |
|
"learning_rate": 3e-05, |
|
"loss": 1.326, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 9.6, |
|
"grad_norm": 1.6235650777816772, |
|
"learning_rate": 3e-05, |
|
"loss": 1.2747, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.599847533632287, |
|
"eval_loss": 1.8297590017318726, |
|
"eval_runtime": 6.3451, |
|
"eval_samples_per_second": 78.801, |
|
"eval_steps_per_second": 9.929, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 10.133333333333333, |
|
"grad_norm": 1.6720285415649414, |
|
"learning_rate": 3e-05, |
|
"loss": 1.2459, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 10.666666666666666, |
|
"grad_norm": 1.6803735494613647, |
|
"learning_rate": 3e-05, |
|
"loss": 1.2202, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 10.997333333333334, |
|
"eval_accuracy": 0.5980717488789238, |
|
"eval_loss": 1.881359338760376, |
|
"eval_runtime": 5.6855, |
|
"eval_samples_per_second": 87.942, |
|
"eval_steps_per_second": 11.081, |
|
"step": 2062 |
|
}, |
|
{ |
|
"epoch": 11.2, |
|
"grad_norm": 1.7732399702072144, |
|
"learning_rate": 3e-05, |
|
"loss": 1.1909, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 11.733333333333333, |
|
"grad_norm": 1.6892499923706055, |
|
"learning_rate": 3e-05, |
|
"loss": 1.1589, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 0.5959372197309417, |
|
"eval_loss": 1.9310756921768188, |
|
"eval_runtime": 6.0847, |
|
"eval_samples_per_second": 82.173, |
|
"eval_steps_per_second": 10.354, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 12.266666666666667, |
|
"grad_norm": 1.7170026302337646, |
|
"learning_rate": 3e-05, |
|
"loss": 1.1364, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 12.8, |
|
"grad_norm": 1.7617255449295044, |
|
"learning_rate": 3e-05, |
|
"loss": 1.1231, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 12.997333333333334, |
|
"eval_accuracy": 0.5955336322869955, |
|
"eval_loss": 1.9429402351379395, |
|
"eval_runtime": 6.8196, |
|
"eval_samples_per_second": 73.318, |
|
"eval_steps_per_second": 9.238, |
|
"step": 2437 |
|
}, |
|
{ |
|
"epoch": 13.333333333333334, |
|
"grad_norm": 2.0009663105010986, |
|
"learning_rate": 3e-05, |
|
"loss": 1.0823, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 13.866666666666667, |
|
"grad_norm": 2.044010877609253, |
|
"learning_rate": 3e-05, |
|
"loss": 1.0624, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_accuracy": 0.593255605381166, |
|
"eval_loss": 1.996883511543274, |
|
"eval_runtime": 6.0345, |
|
"eval_samples_per_second": 82.857, |
|
"eval_steps_per_second": 10.44, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 14.4, |
|
"grad_norm": 2.0219309329986572, |
|
"learning_rate": 3e-05, |
|
"loss": 1.0217, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 14.933333333333334, |
|
"grad_norm": 2.0420074462890625, |
|
"learning_rate": 3e-05, |
|
"loss": 1.0185, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 14.997333333333334, |
|
"eval_accuracy": 0.592170403587444, |
|
"eval_loss": 2.031921625137329, |
|
"eval_runtime": 5.9337, |
|
"eval_samples_per_second": 84.265, |
|
"eval_steps_per_second": 10.617, |
|
"step": 2812 |
|
}, |
|
{ |
|
"epoch": 15.466666666666667, |
|
"grad_norm": 2.2069709300994873, |
|
"learning_rate": 3e-05, |
|
"loss": 0.9645, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"grad_norm": 1.863767147064209, |
|
"learning_rate": 3e-05, |
|
"loss": 0.9718, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_accuracy": 0.5902959641255605, |
|
"eval_loss": 2.0798022747039795, |
|
"eval_runtime": 6.143, |
|
"eval_samples_per_second": 81.394, |
|
"eval_steps_per_second": 10.256, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 16.533333333333335, |
|
"grad_norm": 2.300877809524536, |
|
"learning_rate": 3e-05, |
|
"loss": 0.9101, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 16.997333333333334, |
|
"eval_accuracy": 0.5887354260089687, |
|
"eval_loss": 2.13960337638855, |
|
"eval_runtime": 6.0122, |
|
"eval_samples_per_second": 83.164, |
|
"eval_steps_per_second": 10.479, |
|
"step": 3187 |
|
}, |
|
{ |
|
"epoch": 17.066666666666666, |
|
"grad_norm": 2.586768388748169, |
|
"learning_rate": 3e-05, |
|
"loss": 0.9136, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 17.6, |
|
"grad_norm": 2.3870368003845215, |
|
"learning_rate": 3e-05, |
|
"loss": 0.8606, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_accuracy": 0.5870134529147982, |
|
"eval_loss": 2.1882169246673584, |
|
"eval_runtime": 6.0214, |
|
"eval_samples_per_second": 83.036, |
|
"eval_steps_per_second": 10.463, |
|
"step": 3375 |
|
}, |
|
{ |
|
"epoch": 18.133333333333333, |
|
"grad_norm": 2.6867830753326416, |
|
"learning_rate": 3e-05, |
|
"loss": 0.8724, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 18.666666666666668, |
|
"grad_norm": 2.807380199432373, |
|
"learning_rate": 3e-05, |
|
"loss": 0.8168, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 18.997333333333334, |
|
"eval_accuracy": 0.5862600896860987, |
|
"eval_loss": 2.2291159629821777, |
|
"eval_runtime": 6.3423, |
|
"eval_samples_per_second": 78.836, |
|
"eval_steps_per_second": 9.933, |
|
"step": 3562 |
|
}, |
|
{ |
|
"epoch": 19.2, |
|
"grad_norm": 2.6379778385162354, |
|
"learning_rate": 3e-05, |
|
"loss": 0.8061, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 19.733333333333334, |
|
"grad_norm": 3.161210775375366, |
|
"learning_rate": 3e-05, |
|
"loss": 0.777, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 19.946666666666665, |
|
"eval_accuracy": 0.5851838565022421, |
|
"eval_loss": 2.2508349418640137, |
|
"eval_runtime": 5.6415, |
|
"eval_samples_per_second": 88.63, |
|
"eval_steps_per_second": 11.167, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 19.946666666666665, |
|
"step": 3740, |
|
"total_flos": 3.767212755417825e+17, |
|
"train_loss": 1.2655855785716663, |
|
"train_runtime": 8167.3443, |
|
"train_samples_per_second": 14.693, |
|
"train_steps_per_second": 0.458 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 3740, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 20, |
|
"save_steps": 500, |
|
"total_flos": 3.767212755417825e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|