{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 4528,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.022084805653710248,
      "grad_norm": 0.1020389199256897,
      "learning_rate": 6.666666666666667e-06,
      "loss": 2.5011,
      "step": 100
    },
    {
      "epoch": 0.044169611307420496,
      "grad_norm": 0.1910451352596283,
      "learning_rate": 1.3333333333333333e-05,
      "loss": 2.4789,
      "step": 200
    },
    {
      "epoch": 0.06625441696113074,
      "grad_norm": 0.26337745785713196,
      "learning_rate": 2e-05,
      "loss": 2.4291,
      "step": 300
    },
    {
      "epoch": 0.08833922261484099,
      "grad_norm": 0.3770335614681244,
      "learning_rate": 1.9972406933597812e-05,
      "loss": 2.4186,
      "step": 400
    },
    {
      "epoch": 0.11042402826855123,
      "grad_norm": 0.36907321214675903,
      "learning_rate": 1.988978000985394e-05,
      "loss": 2.3774,
      "step": 500
    },
    {
      "epoch": 0.13250883392226148,
      "grad_norm": 0.4569687247276306,
      "learning_rate": 1.9752575214807077e-05,
      "loss": 2.3992,
      "step": 600
    },
    {
      "epoch": 0.15459363957597172,
      "grad_norm": 0.4629022777080536,
      "learning_rate": 1.9561549728661312e-05,
      "loss": 2.325,
      "step": 700
    },
    {
      "epoch": 0.17667844522968199,
      "grad_norm": 0.47432681918144226,
      "learning_rate": 1.9317757747201386e-05,
      "loss": 2.3013,
      "step": 800
    },
    {
      "epoch": 0.19876325088339222,
      "grad_norm": 0.46376603841781616,
      "learning_rate": 1.9022544664093854e-05,
      "loss": 2.3277,
      "step": 900
    },
    {
      "epoch": 0.22084805653710246,
      "grad_norm": 0.48484688997268677,
      "learning_rate": 1.8677539646179706e-05,
      "loss": 2.314,
      "step": 1000
    },
    {
      "epoch": 0.24293286219081273,
      "grad_norm": 0.655317485332489,
      "learning_rate": 1.828464664273263e-05,
      "loss": 2.3047,
      "step": 1100
    },
    {
      "epoch": 0.26501766784452296,
      "grad_norm": 0.520976722240448,
      "learning_rate": 1.7846033878299232e-05,
      "loss": 2.2971,
      "step": 1200
    },
    {
      "epoch": 0.2871024734982332,
      "grad_norm": 0.502010703086853,
      "learning_rate": 1.7364121887106285e-05,
      "loss": 2.3102,
      "step": 1300
    },
    {
      "epoch": 0.30918727915194344,
      "grad_norm": 0.5849918127059937,
      "learning_rate": 1.684157015506839e-05,
      "loss": 2.2636,
      "step": 1400
    },
    {
      "epoch": 0.33127208480565373,
      "grad_norm": 0.6814427375793457,
      "learning_rate": 1.628126244311369e-05,
      "loss": 2.2854,
      "step": 1500
    },
    {
      "epoch": 0.35335689045936397,
      "grad_norm": 0.4844237267971039,
      "learning_rate": 1.5686290872822504e-05,
      "loss": 2.294,
      "step": 1600
    },
    {
      "epoch": 0.3754416961130742,
      "grad_norm": 0.6080675721168518,
      "learning_rate": 1.5059938862204126e-05,
      "loss": 2.2631,
      "step": 1700
    },
    {
      "epoch": 0.39752650176678445,
      "grad_norm": 0.561839759349823,
      "learning_rate": 1.440566300578259e-05,
      "loss": 2.2489,
      "step": 1800
    },
    {
      "epoch": 0.4196113074204947,
      "grad_norm": 0.5733577609062195,
      "learning_rate": 1.3727073998988202e-05,
      "loss": 2.2505,
      "step": 1900
    },
    {
      "epoch": 0.4416961130742049,
      "grad_norm": 0.6440421342849731,
      "learning_rate": 1.3027916712125825e-05,
      "loss": 2.2341,
      "step": 2000
    },
    {
      "epoch": 0.4637809187279152,
      "grad_norm": 0.5807141661643982,
      "learning_rate": 1.2312049523883851e-05,
      "loss": 2.2842,
      "step": 2100
    },
    {
      "epoch": 0.48586572438162545,
      "grad_norm": 0.6154150366783142,
      "learning_rate": 1.1583423028434343e-05,
      "loss": 2.2756,
      "step": 2200
    },
    {
      "epoch": 0.5079505300353356,
      "grad_norm": 0.8887423276901245,
      "learning_rate": 1.0846058233631565e-05,
      "loss": 2.2388,
      "step": 2300
    },
    {
      "epoch": 0.5300353356890459,
      "grad_norm": 0.6062945127487183,
      "learning_rate": 1.0104024370624644e-05,
      "loss": 2.2708,
      "step": 2400
    },
    {
      "epoch": 0.5521201413427562,
      "grad_norm": 0.7383410930633545,
      "learning_rate": 9.361416437344504e-06,
      "loss": 2.2693,
      "step": 2500
    },
    {
      "epoch": 0.5742049469964664,
      "grad_norm": 0.7274787425994873,
      "learning_rate": 8.622332599793906e-06,
      "loss": 2.2596,
      "step": 2600
    },
    {
      "epoch": 0.5962897526501767,
      "grad_norm": 0.6965683102607727,
      "learning_rate": 7.890851575854108e-06,
      "loss": 2.2652,
      "step": 2700
    },
    {
      "epoch": 0.6183745583038869,
      "grad_norm": 0.6714246273040771,
      "learning_rate": 7.171010126418218e-06,
      "loss": 2.2587,
      "step": 2800
    },
    {
      "epoch": 0.6404593639575972,
      "grad_norm": 0.7770031690597534,
      "learning_rate": 6.466780778068903e-06,
      "loss": 2.2502,
      "step": 2900
    },
    {
      "epoch": 0.6625441696113075,
      "grad_norm": 0.7164767980575562,
      "learning_rate": 5.782049900240432e-06,
      "loss": 2.2477,
      "step": 3000
    },
    {
      "epoch": 0.6846289752650176,
      "grad_norm": 0.7085398435592651,
      "learning_rate": 5.120596257848716e-06,
      "loss": 2.268,
      "step": 3100
    },
    {
      "epoch": 0.7067137809187279,
      "grad_norm": 0.7723512053489685,
      "learning_rate": 4.486070157749059e-06,
      "loss": 2.2363,
      "step": 3200
    },
    {
      "epoch": 0.7287985865724381,
      "grad_norm": 0.6914446353912354,
      "learning_rate": 3.881973304104252e-06,
      "loss": 2.2511,
      "step": 3300
    },
    {
      "epoch": 0.7508833922261484,
      "grad_norm": 0.6801828742027283,
      "learning_rate": 3.311639473833487e-06,
      "loss": 2.2672,
      "step": 3400
    },
    {
      "epoch": 0.7729681978798587,
      "grad_norm": 0.6551490426063538,
      "learning_rate": 2.778216118786782e-06,
      "loss": 2.2574,
      "step": 3500
    },
    {
      "epoch": 0.7950530035335689,
      "grad_norm": 0.7881369590759277,
      "learning_rate": 2.2846469961753916e-06,
      "loss": 2.2728,
      "step": 3600
    },
    {
      "epoch": 0.8171378091872792,
      "grad_norm": 0.6549363136291504,
      "learning_rate": 1.8336559231141726e-06,
      "loss": 2.2253,
      "step": 3700
    },
    {
      "epoch": 0.8392226148409894,
      "grad_norm": 0.6901439428329468,
      "learning_rate": 1.4277317449282834e-06,
      "loss": 2.2392,
      "step": 3800
    },
    {
      "epoch": 0.8613074204946997,
      "grad_norm": 0.6845853924751282,
      "learning_rate": 1.0691146001783081e-06,
      "loss": 2.2545,
      "step": 3900
    },
    {
      "epoch": 0.8833922261484098,
      "grad_norm": 0.7686012387275696,
      "learning_rate": 7.597835582018586e-07,
      "loss": 2.2374,
      "step": 4000
    },
    {
      "epoch": 0.9054770318021201,
      "grad_norm": 0.6920452117919922,
      "learning_rate": 5.014456973952375e-07,
      "loss": 2.249,
      "step": 4100
    },
    {
      "epoch": 0.9275618374558304,
      "grad_norm": 0.5871976613998413,
      "learning_rate": 2.9552668450792965e-07,
      "loss": 2.2629,
      "step": 4200
    },
    {
      "epoch": 0.9496466431095406,
      "grad_norm": 0.7372850179672241,
      "learning_rate": 1.431629069391516e-07,
      "loss": 2.2742,
      "step": 4300
    },
    {
      "epoch": 0.9717314487632509,
      "grad_norm": 0.7272618412971497,
      "learning_rate": 4.519520145525369e-08,
      "loss": 2.238,
      "step": 4400
    },
    {
      "epoch": 0.9938162544169611,
      "grad_norm": 0.6909335851669312,
      "learning_rate": 2.164213936770576e-09,
      "loss": 2.2434,
      "step": 4500
    },
    {
      "epoch": 1.0,
      "step": 4528,
      "total_flos": 8.2279274151936e+16,
      "train_loss": 2.2892676124303164,
      "train_runtime": 1413.0668,
      "train_samples_per_second": 6.408,
      "train_steps_per_second": 3.204
    }
  ],
  "logging_steps": 100,
  "max_steps": 4528,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 8.2279274151936e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}