|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.992, |
|
"eval_steps": 16, |
|
"global_step": 62, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.016, |
|
"grad_norm": 1.4910566806793213, |
|
"learning_rate": 1e-05, |
|
"loss": 88.7097, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.016, |
|
"eval_loss": 11.088533401489258, |
|
"eval_runtime": 1.124, |
|
"eval_samples_per_second": 94.304, |
|
"eval_steps_per_second": 24.021, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.032, |
|
"grad_norm": 1.4108612537384033, |
|
"learning_rate": 2e-05, |
|
"loss": 88.7293, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.048, |
|
"grad_norm": 1.486616849899292, |
|
"learning_rate": 3e-05, |
|
"loss": 88.7434, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.064, |
|
"grad_norm": 1.7622555494308472, |
|
"learning_rate": 4e-05, |
|
"loss": 88.7179, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.5639805793762207, |
|
"learning_rate": 5e-05, |
|
"loss": 88.6803, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.096, |
|
"grad_norm": 1.7514230012893677, |
|
"learning_rate": 6e-05, |
|
"loss": 88.7313, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.112, |
|
"grad_norm": 1.9987614154815674, |
|
"learning_rate": 7e-05, |
|
"loss": 88.7164, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.128, |
|
"grad_norm": 1.940069317817688, |
|
"learning_rate": 8e-05, |
|
"loss": 88.7398, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.144, |
|
"grad_norm": 1.8506841659545898, |
|
"learning_rate": 9e-05, |
|
"loss": 88.6965, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 1.9289461374282837, |
|
"learning_rate": 0.0001, |
|
"loss": 88.8021, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.176, |
|
"grad_norm": 1.9998689889907837, |
|
"learning_rate": 0.00011000000000000002, |
|
"loss": 88.6776, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.192, |
|
"grad_norm": 2.1637938022613525, |
|
"learning_rate": 0.00012, |
|
"loss": 88.7963, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.208, |
|
"grad_norm": 2.258723497390747, |
|
"learning_rate": 0.00013000000000000002, |
|
"loss": 88.7175, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.224, |
|
"grad_norm": 2.3891043663024902, |
|
"learning_rate": 0.00014, |
|
"loss": 88.6413, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 2.612598180770874, |
|
"learning_rate": 0.00015000000000000001, |
|
"loss": 88.7464, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.256, |
|
"grad_norm": 2.633556365966797, |
|
"learning_rate": 0.00016, |
|
"loss": 88.7045, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.256, |
|
"eval_loss": 11.084848403930664, |
|
"eval_runtime": 0.8206, |
|
"eval_samples_per_second": 129.18, |
|
"eval_steps_per_second": 32.904, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.272, |
|
"grad_norm": 1.444272756576538, |
|
"learning_rate": 0.00017, |
|
"loss": 88.719, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.288, |
|
"grad_norm": 1.4536869525909424, |
|
"learning_rate": 0.00018, |
|
"loss": 88.7756, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.304, |
|
"grad_norm": 1.6990916728973389, |
|
"learning_rate": 0.00019, |
|
"loss": 88.719, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.6907556056976318, |
|
"learning_rate": 0.0002, |
|
"loss": 88.6939, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.336, |
|
"grad_norm": 1.7854983806610107, |
|
"learning_rate": 0.00019972037971811802, |
|
"loss": 88.6928, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.352, |
|
"grad_norm": 1.7495782375335693, |
|
"learning_rate": 0.00019888308262251285, |
|
"loss": 88.7139, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.368, |
|
"grad_norm": 1.8135170936584473, |
|
"learning_rate": 0.00019749279121818235, |
|
"loss": 88.6808, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.384, |
|
"grad_norm": 1.9571936130523682, |
|
"learning_rate": 0.0001955572805786141, |
|
"loss": 88.7278, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.9375219345092773, |
|
"learning_rate": 0.00019308737486442045, |
|
"loss": 88.661, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.416, |
|
"grad_norm": 2.0492427349090576, |
|
"learning_rate": 0.0001900968867902419, |
|
"loss": 88.6735, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.432, |
|
"grad_norm": 2.057321786880493, |
|
"learning_rate": 0.00018660254037844388, |
|
"loss": 88.5661, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.448, |
|
"grad_norm": 2.227236747741699, |
|
"learning_rate": 0.0001826238774315995, |
|
"loss": 88.6781, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.464, |
|
"grad_norm": 2.3857967853546143, |
|
"learning_rate": 0.000178183148246803, |
|
"loss": 88.6794, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 2.7329936027526855, |
|
"learning_rate": 0.00017330518718298264, |
|
"loss": 88.6323, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.496, |
|
"grad_norm": 3.2216603755950928, |
|
"learning_rate": 0.00016801727377709194, |
|
"loss": 88.5737, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.512, |
|
"grad_norm": 1.5606772899627686, |
|
"learning_rate": 0.00016234898018587337, |
|
"loss": 88.6774, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.512, |
|
"eval_loss": 11.075674057006836, |
|
"eval_runtime": 0.8047, |
|
"eval_samples_per_second": 131.72, |
|
"eval_steps_per_second": 33.551, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.528, |
|
"grad_norm": 1.6445367336273193, |
|
"learning_rate": 0.0001563320058063622, |
|
"loss": 88.6368, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.544, |
|
"grad_norm": 1.6764920949935913, |
|
"learning_rate": 0.00015000000000000001, |
|
"loss": 88.6352, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 1.6223012208938599, |
|
"learning_rate": 0.00014338837391175582, |
|
"loss": 88.6562, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.576, |
|
"grad_norm": 1.6213066577911377, |
|
"learning_rate": 0.00013653410243663952, |
|
"loss": 88.6551, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.592, |
|
"grad_norm": 1.6623308658599854, |
|
"learning_rate": 0.00012947551744109043, |
|
"loss": 88.6895, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.608, |
|
"grad_norm": 1.8002474308013916, |
|
"learning_rate": 0.00012225209339563145, |
|
"loss": 88.674, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.624, |
|
"grad_norm": 1.8450833559036255, |
|
"learning_rate": 0.00011490422661761744, |
|
"loss": 88.5731, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 1.9053994417190552, |
|
"learning_rate": 0.00010747300935864243, |
|
"loss": 88.6383, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.656, |
|
"grad_norm": 1.9040968418121338, |
|
"learning_rate": 0.0001, |
|
"loss": 88.6602, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.672, |
|
"grad_norm": 2.163938522338867, |
|
"learning_rate": 9.252699064135758e-05, |
|
"loss": 88.5704, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.688, |
|
"grad_norm": 2.156372547149658, |
|
"learning_rate": 8.509577338238255e-05, |
|
"loss": 88.4837, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.704, |
|
"grad_norm": 2.1990439891815186, |
|
"learning_rate": 7.774790660436858e-05, |
|
"loss": 88.6662, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 2.474072217941284, |
|
"learning_rate": 7.052448255890957e-05, |
|
"loss": 88.5809, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.736, |
|
"grad_norm": 2.7647547721862793, |
|
"learning_rate": 6.34658975633605e-05, |
|
"loss": 88.6076, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.752, |
|
"grad_norm": 2.5142226219177246, |
|
"learning_rate": 5.6611626088244194e-05, |
|
"loss": 88.7863, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.768, |
|
"grad_norm": 1.5332900285720825, |
|
"learning_rate": 5.000000000000002e-05, |
|
"loss": 88.6006, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.768, |
|
"eval_loss": 11.07020378112793, |
|
"eval_runtime": 0.612, |
|
"eval_samples_per_second": 173.197, |
|
"eval_steps_per_second": 44.116, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.784, |
|
"grad_norm": 1.6119565963745117, |
|
"learning_rate": 4.3667994193637796e-05, |
|
"loss": 88.609, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 1.591171145439148, |
|
"learning_rate": 3.7651019814126654e-05, |
|
"loss": 88.6227, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.816, |
|
"grad_norm": 1.7413899898529053, |
|
"learning_rate": 3.198272622290804e-05, |
|
"loss": 88.6242, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.832, |
|
"grad_norm": 1.8746490478515625, |
|
"learning_rate": 2.669481281701739e-05, |
|
"loss": 88.5348, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.848, |
|
"grad_norm": 1.946791172027588, |
|
"learning_rate": 2.181685175319702e-05, |
|
"loss": 88.5242, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.864, |
|
"grad_norm": 1.9902241230010986, |
|
"learning_rate": 1.7376122568400532e-05, |
|
"loss": 88.6423, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 1.9461379051208496, |
|
"learning_rate": 1.339745962155613e-05, |
|
"loss": 88.6454, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.896, |
|
"grad_norm": 2.044666290283203, |
|
"learning_rate": 9.903113209758096e-06, |
|
"loss": 88.5542, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.912, |
|
"grad_norm": 1.9682707786560059, |
|
"learning_rate": 6.9126251355795864e-06, |
|
"loss": 88.6255, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.928, |
|
"grad_norm": 2.2248926162719727, |
|
"learning_rate": 4.442719421385922e-06, |
|
"loss": 88.5276, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.944, |
|
"grad_norm": 2.1064934730529785, |
|
"learning_rate": 2.5072087818176382e-06, |
|
"loss": 88.6175, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 2.4718239307403564, |
|
"learning_rate": 1.1169173774871478e-06, |
|
"loss": 88.5009, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.976, |
|
"grad_norm": 2.4647696018218994, |
|
"learning_rate": 2.7962028188198706e-07, |
|
"loss": 88.6283, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.992, |
|
"grad_norm": 3.2278006076812744, |
|
"learning_rate": 0.0, |
|
"loss": 88.4972, |
|
"step": 62 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 62, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 16, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1856734494720.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|