|
{ |
|
"best_metric": 0.6915614008903503, |
|
"best_model_checkpoint": "distilbert_lda_5_v1_rte/checkpoint-70", |
|
"epoch": 12.0, |
|
"eval_steps": 500, |
|
"global_step": 120, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.5855181813240051, |
|
"learning_rate": 0.00098, |
|
"loss": 0.9449, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.5270758122743683, |
|
"eval_loss": 0.7021801471710205, |
|
"eval_runtime": 0.1254, |
|
"eval_samples_per_second": 2209.06, |
|
"eval_steps_per_second": 15.95, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.4455162584781647, |
|
"learning_rate": 0.00096, |
|
"loss": 0.7042, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.4729241877256318, |
|
"eval_loss": 0.6999943852424622, |
|
"eval_runtime": 0.1299, |
|
"eval_samples_per_second": 2133.087, |
|
"eval_steps_per_second": 15.401, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.8011177182197571, |
|
"learning_rate": 0.00094, |
|
"loss": 0.7004, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.5270758122743683, |
|
"eval_loss": 0.691927969455719, |
|
"eval_runtime": 0.1312, |
|
"eval_samples_per_second": 2111.997, |
|
"eval_steps_per_second": 15.249, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 0.39011821150779724, |
|
"learning_rate": 0.00092, |
|
"loss": 0.6962, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.5270758122743683, |
|
"eval_loss": 0.6929433941841125, |
|
"eval_runtime": 0.1287, |
|
"eval_samples_per_second": 2152.24, |
|
"eval_steps_per_second": 15.54, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 0.33056044578552246, |
|
"learning_rate": 0.0009000000000000001, |
|
"loss": 0.6965, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.5270758122743683, |
|
"eval_loss": 0.6995571851730347, |
|
"eval_runtime": 0.1316, |
|
"eval_samples_per_second": 2104.975, |
|
"eval_steps_per_second": 15.198, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"grad_norm": 0.3222827613353729, |
|
"learning_rate": 0.00088, |
|
"loss": 0.6965, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.5270758122743683, |
|
"eval_loss": 0.6917728781700134, |
|
"eval_runtime": 0.129, |
|
"eval_samples_per_second": 2146.95, |
|
"eval_steps_per_second": 15.501, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"grad_norm": 0.2771111726760864, |
|
"learning_rate": 0.00086, |
|
"loss": 0.6947, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_accuracy": 0.5270758122743683, |
|
"eval_loss": 0.6915614008903503, |
|
"eval_runtime": 0.1383, |
|
"eval_samples_per_second": 2002.531, |
|
"eval_steps_per_second": 14.459, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"grad_norm": 0.3152952194213867, |
|
"learning_rate": 0.00084, |
|
"loss": 0.6949, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.4729241877256318, |
|
"eval_loss": 0.6938458681106567, |
|
"eval_runtime": 0.1301, |
|
"eval_samples_per_second": 2129.334, |
|
"eval_steps_per_second": 15.374, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"grad_norm": 0.508091390132904, |
|
"learning_rate": 0.00082, |
|
"loss": 0.6948, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_accuracy": 0.4729241877256318, |
|
"eval_loss": 0.699120044708252, |
|
"eval_runtime": 0.1297, |
|
"eval_samples_per_second": 2135.915, |
|
"eval_steps_per_second": 15.422, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 0.13915003836154938, |
|
"learning_rate": 0.0008, |
|
"loss": 0.6957, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.5270758122743683, |
|
"eval_loss": 0.6919844150543213, |
|
"eval_runtime": 0.1283, |
|
"eval_samples_per_second": 2159.75, |
|
"eval_steps_per_second": 15.594, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"grad_norm": 0.2669129967689514, |
|
"learning_rate": 0.0007800000000000001, |
|
"loss": 0.6941, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_accuracy": 0.4729241877256318, |
|
"eval_loss": 0.6955804228782654, |
|
"eval_runtime": 0.1281, |
|
"eval_samples_per_second": 2162.358, |
|
"eval_steps_per_second": 15.613, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"grad_norm": 0.15157711505889893, |
|
"learning_rate": 0.00076, |
|
"loss": 0.6944, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 0.4729241877256318, |
|
"eval_loss": 0.6934651136398315, |
|
"eval_runtime": 0.1305, |
|
"eval_samples_per_second": 2122.658, |
|
"eval_steps_per_second": 15.326, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"step": 120, |
|
"total_flos": 1979062935920640.0, |
|
"train_loss": 0.7172688285509745, |
|
"train_runtime": 49.8636, |
|
"train_samples_per_second": 2496.812, |
|
"train_steps_per_second": 10.027 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 50, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 5, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 5 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1979062935920640.0, |
|
"train_batch_size": 256, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|