{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.03931976801336872,
  "eval_steps": 13,
  "global_step": 50,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0007863953602673744,
      "grad_norm": 1.727420449256897,
      "learning_rate": 1e-05,
      "loss": 1.2759,
      "step": 1
    },
    {
      "epoch": 0.0007863953602673744,
      "eval_loss": 2.9491188526153564,
      "eval_runtime": 802.2962,
      "eval_samples_per_second": 2.67,
      "eval_steps_per_second": 1.335,
      "step": 1
    },
    {
      "epoch": 0.0015727907205347487,
      "grad_norm": 2.06775164604187,
      "learning_rate": 2e-05,
      "loss": 1.2877,
      "step": 2
    },
    {
      "epoch": 0.0023591860808021233,
      "grad_norm": 2.407663583755493,
      "learning_rate": 3e-05,
      "loss": 1.367,
      "step": 3
    },
    {
      "epoch": 0.0031455814410694975,
      "grad_norm": 1.9768824577331543,
      "learning_rate": 4e-05,
      "loss": 1.2932,
      "step": 4
    },
    {
      "epoch": 0.003931976801336872,
      "grad_norm": 2.2676198482513428,
      "learning_rate": 5e-05,
      "loss": 1.4012,
      "step": 5
    },
    {
      "epoch": 0.004718372161604247,
      "grad_norm": 2.2389116287231445,
      "learning_rate": 6e-05,
      "loss": 1.4455,
      "step": 6
    },
    {
      "epoch": 0.005504767521871621,
      "grad_norm": 3.368894577026367,
      "learning_rate": 7e-05,
      "loss": 1.6058,
      "step": 7
    },
    {
      "epoch": 0.006291162882138995,
      "grad_norm": 2.1887245178222656,
      "learning_rate": 8e-05,
      "loss": 1.315,
      "step": 8
    },
    {
      "epoch": 0.00707755824240637,
      "grad_norm": 2.749788284301758,
      "learning_rate": 9e-05,
      "loss": 1.4487,
      "step": 9
    },
    {
      "epoch": 0.007863953602673744,
      "grad_norm": 2.44626784324646,
      "learning_rate": 0.0001,
      "loss": 0.9548,
      "step": 10
    },
    {
      "epoch": 0.008650348962941118,
      "grad_norm": 2.37338924407959,
      "learning_rate": 9.98458666866564e-05,
      "loss": 0.8456,
      "step": 11
    },
    {
      "epoch": 0.009436744323208493,
      "grad_norm": 1.9337718486785889,
      "learning_rate": 9.938441702975689e-05,
      "loss": 0.8131,
      "step": 12
    },
    {
      "epoch": 0.010223139683475867,
      "grad_norm": 5.305741786956787,
      "learning_rate": 9.861849601988383e-05,
      "loss": 1.3641,
      "step": 13
    },
    {
      "epoch": 0.010223139683475867,
      "eval_loss": 1.1911835670471191,
      "eval_runtime": 790.9924,
      "eval_samples_per_second": 2.708,
      "eval_steps_per_second": 1.354,
      "step": 13
    },
    {
      "epoch": 0.011009535043743243,
      "grad_norm": 1.8393322229385376,
      "learning_rate": 9.755282581475769e-05,
      "loss": 0.8994,
      "step": 14
    },
    {
      "epoch": 0.011795930404010616,
      "grad_norm": 2.288557529449463,
      "learning_rate": 9.619397662556435e-05,
      "loss": 0.964,
      "step": 15
    },
    {
      "epoch": 0.01258232576427799,
      "grad_norm": 2.239872455596924,
      "learning_rate": 9.45503262094184e-05,
      "loss": 0.8643,
      "step": 16
    },
    {
      "epoch": 0.013368721124545365,
      "grad_norm": 2.4298009872436523,
      "learning_rate": 9.263200821770461e-05,
      "loss": 0.9466,
      "step": 17
    },
    {
      "epoch": 0.01415511648481274,
      "grad_norm": 3.0432651042938232,
      "learning_rate": 9.045084971874738e-05,
      "loss": 1.0756,
      "step": 18
    },
    {
      "epoch": 0.014941511845080115,
      "grad_norm": 1.3065857887268066,
      "learning_rate": 8.802029828000156e-05,
      "loss": 0.6739,
      "step": 19
    },
    {
      "epoch": 0.01572790720534749,
      "grad_norm": 2.0720527172088623,
      "learning_rate": 8.535533905932738e-05,
      "loss": 0.8697,
      "step": 20
    },
    {
      "epoch": 0.016514302565614862,
      "grad_norm": 1.8250558376312256,
      "learning_rate": 8.247240241650918e-05,
      "loss": 0.7318,
      "step": 21
    },
    {
      "epoch": 0.017300697925882236,
      "grad_norm": 2.166428565979004,
      "learning_rate": 7.938926261462366e-05,
      "loss": 0.7315,
      "step": 22
    },
    {
      "epoch": 0.018087093286149613,
      "grad_norm": 2.2534689903259277,
      "learning_rate": 7.612492823579745e-05,
      "loss": 0.9042,
      "step": 23
    },
    {
      "epoch": 0.018873488646416987,
      "grad_norm": 1.4026483297348022,
      "learning_rate": 7.269952498697734e-05,
      "loss": 0.7183,
      "step": 24
    },
    {
      "epoch": 0.01965988400668436,
      "grad_norm": 1.6705015897750854,
      "learning_rate": 6.91341716182545e-05,
      "loss": 0.6787,
      "step": 25
    },
    {
      "epoch": 0.020446279366951734,
      "grad_norm": 2.1395044326782227,
      "learning_rate": 6.545084971874738e-05,
      "loss": 0.9407,
      "step": 26
    },
    {
      "epoch": 0.020446279366951734,
      "eval_loss": 0.8785327672958374,
      "eval_runtime": 781.2326,
      "eval_samples_per_second": 2.742,
      "eval_steps_per_second": 1.371,
      "step": 26
    },
    {
      "epoch": 0.021232674727219108,
      "grad_norm": 1.2242342233657837,
      "learning_rate": 6.167226819279528e-05,
      "loss": 0.6702,
      "step": 27
    },
    {
      "epoch": 0.022019070087486485,
      "grad_norm": 1.7602314949035645,
      "learning_rate": 5.782172325201155e-05,
      "loss": 0.9246,
      "step": 28
    },
    {
      "epoch": 0.02280546544775386,
      "grad_norm": 2.0709216594696045,
      "learning_rate": 5.392295478639225e-05,
      "loss": 0.8103,
      "step": 29
    },
    {
      "epoch": 0.023591860808021232,
      "grad_norm": 1.5635908842086792,
      "learning_rate": 5e-05,
      "loss": 0.8027,
      "step": 30
    },
    {
      "epoch": 0.024378256168288606,
      "grad_norm": 1.6824203729629517,
      "learning_rate": 4.607704521360776e-05,
      "loss": 0.776,
      "step": 31
    },
    {
      "epoch": 0.02516465152855598,
      "grad_norm": 1.457542061805725,
      "learning_rate": 4.2178276747988446e-05,
      "loss": 0.5255,
      "step": 32
    },
    {
      "epoch": 0.025951046888823357,
      "grad_norm": 1.5966757535934448,
      "learning_rate": 3.832773180720475e-05,
      "loss": 0.6032,
      "step": 33
    },
    {
      "epoch": 0.02673744224909073,
      "grad_norm": 1.5422358512878418,
      "learning_rate": 3.4549150281252636e-05,
      "loss": 0.548,
      "step": 34
    },
    {
      "epoch": 0.027523837609358105,
      "grad_norm": 1.623889446258545,
      "learning_rate": 3.086582838174551e-05,
      "loss": 0.7702,
      "step": 35
    },
    {
      "epoch": 0.02831023296962548,
      "grad_norm": 1.665121078491211,
      "learning_rate": 2.7300475013022663e-05,
      "loss": 0.6063,
      "step": 36
    },
    {
      "epoch": 0.029096628329892852,
      "grad_norm": 1.3971141576766968,
      "learning_rate": 2.3875071764202563e-05,
      "loss": 0.48,
      "step": 37
    },
    {
      "epoch": 0.02988302369016023,
      "grad_norm": 1.3747204542160034,
      "learning_rate": 2.061073738537635e-05,
      "loss": 0.6483,
      "step": 38
    },
    {
      "epoch": 0.030669419050427603,
      "grad_norm": 1.3398278951644897,
      "learning_rate": 1.7527597583490822e-05,
      "loss": 0.7969,
      "step": 39
    },
    {
      "epoch": 0.030669419050427603,
      "eval_loss": 0.8240298628807068,
      "eval_runtime": 792.48,
      "eval_samples_per_second": 2.703,
      "eval_steps_per_second": 1.351,
      "step": 39
    },
    {
      "epoch": 0.03145581441069498,
      "grad_norm": 1.3485429286956787,
      "learning_rate": 1.4644660940672627e-05,
      "loss": 0.6484,
      "step": 40
    },
    {
      "epoch": 0.03224220977096235,
      "grad_norm": 1.2892612218856812,
      "learning_rate": 1.1979701719998453e-05,
      "loss": 0.7116,
      "step": 41
    },
    {
      "epoch": 0.033028605131229724,
      "grad_norm": 1.8880107402801514,
      "learning_rate": 9.549150281252633e-06,
      "loss": 0.5925,
      "step": 42
    },
    {
      "epoch": 0.0338150004914971,
      "grad_norm": 1.4518492221832275,
      "learning_rate": 7.367991782295391e-06,
      "loss": 0.6824,
      "step": 43
    },
    {
      "epoch": 0.03460139585176447,
      "grad_norm": 2.3983242511749268,
      "learning_rate": 5.449673790581611e-06,
      "loss": 0.9552,
      "step": 44
    },
    {
      "epoch": 0.03538779121203185,
      "grad_norm": 1.543707013130188,
      "learning_rate": 3.8060233744356633e-06,
      "loss": 0.8833,
      "step": 45
    },
    {
      "epoch": 0.036174186572299226,
      "grad_norm": 1.7945361137390137,
      "learning_rate": 2.4471741852423237e-06,
      "loss": 0.8763,
      "step": 46
    },
    {
      "epoch": 0.0369605819325666,
      "grad_norm": 1.6415727138519287,
      "learning_rate": 1.3815039801161721e-06,
      "loss": 0.7373,
      "step": 47
    },
    {
      "epoch": 0.03774697729283397,
      "grad_norm": 1.7929890155792236,
      "learning_rate": 6.15582970243117e-07,
      "loss": 0.6606,
      "step": 48
    },
    {
      "epoch": 0.03853337265310135,
      "grad_norm": 1.3461647033691406,
      "learning_rate": 1.5413331334360182e-07,
      "loss": 0.7808,
      "step": 49
    },
    {
      "epoch": 0.03931976801336872,
      "grad_norm": 1.1689928770065308,
      "learning_rate": 0.0,
      "loss": 0.7459,
      "step": 50
    }
  ],
  "logging_steps": 1,
  "max_steps": 50,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 13,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3.067577446170624e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}