|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 20.0, |
|
"eval_steps": 500, |
|
"global_step": 5000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.1675417125225067, |
|
"learning_rate": 3e-05, |
|
"loss": 1.5895, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.1831749528646469, |
|
"learning_rate": 3e-05, |
|
"loss": 1.4742, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.6748151382823872, |
|
"eval_loss": 1.531341314315796, |
|
"eval_runtime": 9.1549, |
|
"eval_samples_per_second": 54.616, |
|
"eval_steps_per_second": 6.882, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 0.22951605916023254, |
|
"learning_rate": 3e-05, |
|
"loss": 1.4652, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 0.23647047579288483, |
|
"learning_rate": 3e-05, |
|
"loss": 1.4601, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.2248772829771042, |
|
"learning_rate": 3e-05, |
|
"loss": 1.45, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.6756593886462882, |
|
"eval_loss": 1.5196012258529663, |
|
"eval_runtime": 9.0923, |
|
"eval_samples_per_second": 54.992, |
|
"eval_steps_per_second": 6.929, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 0.26751402020454407, |
|
"learning_rate": 3e-05, |
|
"loss": 1.4361, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 0.29390770196914673, |
|
"learning_rate": 3e-05, |
|
"loss": 1.4269, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.6760756914119359, |
|
"eval_loss": 1.5133788585662842, |
|
"eval_runtime": 9.1109, |
|
"eval_samples_per_second": 54.879, |
|
"eval_steps_per_second": 6.915, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"grad_norm": 0.3322733938694, |
|
"learning_rate": 3e-05, |
|
"loss": 1.4184, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"grad_norm": 0.3646068871021271, |
|
"learning_rate": 3e-05, |
|
"loss": 1.3973, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 0.4150474965572357, |
|
"learning_rate": 3e-05, |
|
"loss": 1.3999, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.6762474526928676, |
|
"eval_loss": 1.511995792388916, |
|
"eval_runtime": 9.0922, |
|
"eval_samples_per_second": 54.992, |
|
"eval_steps_per_second": 6.929, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 4.4, |
|
"grad_norm": 0.44478657841682434, |
|
"learning_rate": 3e-05, |
|
"loss": 1.3624, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 4.8, |
|
"grad_norm": 0.48866602778434753, |
|
"learning_rate": 3e-05, |
|
"loss": 1.3614, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.6759650655021834, |
|
"eval_loss": 1.5192290544509888, |
|
"eval_runtime": 9.1143, |
|
"eval_samples_per_second": 54.859, |
|
"eval_steps_per_second": 6.912, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 5.2, |
|
"grad_norm": 0.5195505023002625, |
|
"learning_rate": 3e-05, |
|
"loss": 1.3431, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 5.6, |
|
"grad_norm": 0.5769343972206116, |
|
"learning_rate": 3e-05, |
|
"loss": 1.3264, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"grad_norm": 0.5181849598884583, |
|
"learning_rate": 3e-05, |
|
"loss": 1.3303, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.6755080058224163, |
|
"eval_loss": 1.5265752077102661, |
|
"eval_runtime": 9.1234, |
|
"eval_samples_per_second": 54.804, |
|
"eval_steps_per_second": 6.905, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 6.4, |
|
"grad_norm": 0.6372528076171875, |
|
"learning_rate": 3e-05, |
|
"loss": 1.2883, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 6.8, |
|
"grad_norm": 0.6501044034957886, |
|
"learning_rate": 3e-05, |
|
"loss": 1.2946, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_accuracy": 0.6747045123726346, |
|
"eval_loss": 1.5446096658706665, |
|
"eval_runtime": 9.0797, |
|
"eval_samples_per_second": 55.068, |
|
"eval_steps_per_second": 6.939, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 7.2, |
|
"grad_norm": 0.7209816575050354, |
|
"learning_rate": 3e-05, |
|
"loss": 1.2705, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 7.6, |
|
"grad_norm": 0.7495877742767334, |
|
"learning_rate": 3e-05, |
|
"loss": 1.2498, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"grad_norm": 0.681526780128479, |
|
"learning_rate": 3e-05, |
|
"loss": 1.2518, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.6745036390101893, |
|
"eval_loss": 1.5590205192565918, |
|
"eval_runtime": 8.0597, |
|
"eval_samples_per_second": 62.037, |
|
"eval_steps_per_second": 7.817, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 8.4, |
|
"grad_norm": 0.7470565438270569, |
|
"learning_rate": 3e-05, |
|
"loss": 1.2196, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 8.8, |
|
"grad_norm": 0.7745229005813599, |
|
"learning_rate": 3e-05, |
|
"loss": 1.2082, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_accuracy": 0.6740349344978166, |
|
"eval_loss": 1.571682333946228, |
|
"eval_runtime": 9.0961, |
|
"eval_samples_per_second": 54.969, |
|
"eval_steps_per_second": 6.926, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 9.2, |
|
"grad_norm": 0.8478706479072571, |
|
"learning_rate": 3e-05, |
|
"loss": 1.2017, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 9.6, |
|
"grad_norm": 0.9340612292289734, |
|
"learning_rate": 3e-05, |
|
"loss": 1.1742, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 0.9207039475440979, |
|
"learning_rate": 3e-05, |
|
"loss": 1.19, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.6727074235807861, |
|
"eval_loss": 1.6021865606307983, |
|
"eval_runtime": 9.0984, |
|
"eval_samples_per_second": 54.955, |
|
"eval_steps_per_second": 6.924, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 10.4, |
|
"grad_norm": 0.9244349598884583, |
|
"learning_rate": 3e-05, |
|
"loss": 1.1299, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 10.8, |
|
"grad_norm": 0.9110805988311768, |
|
"learning_rate": 3e-05, |
|
"loss": 1.1523, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_accuracy": 0.672608442503639, |
|
"eval_loss": 1.6098225116729736, |
|
"eval_runtime": 9.117, |
|
"eval_samples_per_second": 54.843, |
|
"eval_steps_per_second": 6.91, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 11.2, |
|
"grad_norm": 0.916287899017334, |
|
"learning_rate": 3e-05, |
|
"loss": 1.1278, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 11.6, |
|
"grad_norm": 1.0008416175842285, |
|
"learning_rate": 3e-05, |
|
"loss": 1.0981, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"grad_norm": 0.9763438701629639, |
|
"learning_rate": 3e-05, |
|
"loss": 1.1193, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 0.671589519650655, |
|
"eval_loss": 1.6344681978225708, |
|
"eval_runtime": 9.1836, |
|
"eval_samples_per_second": 54.445, |
|
"eval_steps_per_second": 6.86, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 12.4, |
|
"grad_norm": 1.0682412385940552, |
|
"learning_rate": 3e-05, |
|
"loss": 1.0604, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 12.8, |
|
"grad_norm": 1.1306017637252808, |
|
"learning_rate": 3e-05, |
|
"loss": 1.0736, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_accuracy": 0.6707016011644833, |
|
"eval_loss": 1.674833059310913, |
|
"eval_runtime": 8.098, |
|
"eval_samples_per_second": 61.744, |
|
"eval_steps_per_second": 7.78, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 13.2, |
|
"grad_norm": 1.154135823249817, |
|
"learning_rate": 3e-05, |
|
"loss": 1.054, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 13.6, |
|
"grad_norm": 1.1169854402542114, |
|
"learning_rate": 3e-05, |
|
"loss": 1.0253, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"grad_norm": 1.0877137184143066, |
|
"learning_rate": 3e-05, |
|
"loss": 1.0414, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_accuracy": 0.6701280931586608, |
|
"eval_loss": 1.688016653060913, |
|
"eval_runtime": 9.1712, |
|
"eval_samples_per_second": 54.519, |
|
"eval_steps_per_second": 6.869, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 14.4, |
|
"grad_norm": 1.2245118618011475, |
|
"learning_rate": 3e-05, |
|
"loss": 0.9823, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 14.8, |
|
"grad_norm": 1.2784464359283447, |
|
"learning_rate": 3e-05, |
|
"loss": 1.0069, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_accuracy": 0.6693682678311499, |
|
"eval_loss": 1.7182435989379883, |
|
"eval_runtime": 9.1433, |
|
"eval_samples_per_second": 54.685, |
|
"eval_steps_per_second": 6.89, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 15.2, |
|
"grad_norm": 1.182626724243164, |
|
"learning_rate": 3e-05, |
|
"loss": 0.9834, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 15.6, |
|
"grad_norm": 1.3419315814971924, |
|
"learning_rate": 3e-05, |
|
"loss": 0.9608, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"grad_norm": 1.2352997064590454, |
|
"learning_rate": 3e-05, |
|
"loss": 0.9654, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_accuracy": 0.6685036390101893, |
|
"eval_loss": 1.7521902322769165, |
|
"eval_runtime": 9.1045, |
|
"eval_samples_per_second": 54.918, |
|
"eval_steps_per_second": 6.92, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 16.4, |
|
"grad_norm": 1.439382791519165, |
|
"learning_rate": 3e-05, |
|
"loss": 0.9236, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 16.8, |
|
"grad_norm": 1.3681950569152832, |
|
"learning_rate": 3e-05, |
|
"loss": 0.9337, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"eval_accuracy": 0.6677409024745269, |
|
"eval_loss": 1.7825894355773926, |
|
"eval_runtime": 9.1188, |
|
"eval_samples_per_second": 54.832, |
|
"eval_steps_per_second": 6.909, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 17.2, |
|
"grad_norm": 1.417385458946228, |
|
"learning_rate": 3e-05, |
|
"loss": 0.9135, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 17.6, |
|
"grad_norm": 1.5673705339431763, |
|
"learning_rate": 3e-05, |
|
"loss": 0.8941, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"grad_norm": 1.5578211545944214, |
|
"learning_rate": 3e-05, |
|
"loss": 0.9, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_accuracy": 0.6671615720524018, |
|
"eval_loss": 1.8080195188522339, |
|
"eval_runtime": 8.0689, |
|
"eval_samples_per_second": 61.967, |
|
"eval_steps_per_second": 7.808, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 18.4, |
|
"grad_norm": 1.671644687652588, |
|
"learning_rate": 3e-05, |
|
"loss": 0.8522, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 18.8, |
|
"grad_norm": 1.5266335010528564, |
|
"learning_rate": 3e-05, |
|
"loss": 0.8704, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"eval_accuracy": 0.6663289665211063, |
|
"eval_loss": 1.8349848985671997, |
|
"eval_runtime": 9.1076, |
|
"eval_samples_per_second": 54.899, |
|
"eval_steps_per_second": 6.917, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 19.2, |
|
"grad_norm": 1.5969411134719849, |
|
"learning_rate": 3e-05, |
|
"loss": 0.8469, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 19.6, |
|
"grad_norm": 1.5390815734863281, |
|
"learning_rate": 3e-05, |
|
"loss": 0.8398, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"grad_norm": 1.5651904344558716, |
|
"learning_rate": 3e-05, |
|
"loss": 0.8407, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_accuracy": 0.6657583697234353, |
|
"eval_loss": 1.8696147203445435, |
|
"eval_runtime": 9.1023, |
|
"eval_samples_per_second": 54.931, |
|
"eval_steps_per_second": 6.921, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"step": 5000, |
|
"total_flos": 9.221411586147615e+17, |
|
"train_loss": 1.163707649230957, |
|
"train_runtime": 11749.7477, |
|
"train_samples_per_second": 13.617, |
|
"train_steps_per_second": 0.426 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 5000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 20, |
|
"save_steps": 500, |
|
"total_flos": 9.221411586147615e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|