Mistral-DNA-v1-417M-Athaliana / trainer_state.json
{
"best_metric": 6.85207986831665,
"best_model_checkpoint": "./results/models/mistral-dna/checkpoint-18333",
"epoch": 7.0,
"eval_steps": 500,
"global_step": 18333,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.19091256204658266,
"grad_norm": 0.0634765625,
"learning_rate": 0.003984726995036274,
"loss": 6.9779,
"step": 500
},
{
"epoch": 0.3818251240931653,
"grad_norm": 0.09716796875,
"learning_rate": 0.003969453990072547,
"loss": 6.8377,
"step": 1000
},
{
"epoch": 0.572737686139748,
"grad_norm": 0.07177734375,
"learning_rate": 0.003954180985108821,
"loss": 6.7842,
"step": 1500
},
{
"epoch": 0.7636502481863306,
"grad_norm": 0.1025390625,
"learning_rate": 0.003938907980145094,
"loss": 6.8022,
"step": 2000
},
{
"epoch": 0.9545628102329133,
"grad_norm": 1.4921875,
"learning_rate": 0.003923634975181367,
"loss": 6.8364,
"step": 2500
},
{
"epoch": 1.0,
"eval_loss": 6.887508392333984,
"eval_runtime": 0.7694,
"eval_samples_per_second": 109.176,
"eval_steps_per_second": 3.899,
"step": 2619
},
{
"epoch": 1.145475372279496,
"grad_norm": 1.515625,
"learning_rate": 0.0039083619702176406,
"loss": 6.8965,
"step": 3000
},
{
"epoch": 1.3363879343260787,
"grad_norm": 1.859375,
"learning_rate": 0.003893088965253914,
"loss": 6.9056,
"step": 3500
},
{
"epoch": 1.5273004963726613,
"grad_norm": 2.28125,
"learning_rate": 0.003877815960290187,
"loss": 6.8995,
"step": 4000
},
{
"epoch": 1.718213058419244,
"grad_norm": 1.8984375,
"learning_rate": 0.003862542955326461,
"loss": 6.8937,
"step": 4500
},
{
"epoch": 1.9091256204658267,
"grad_norm": 3.859375,
"learning_rate": 0.0038472699503627338,
"loss": 6.8934,
"step": 5000
},
{
"epoch": 2.0,
"eval_loss": 6.88522481918335,
"eval_runtime": 0.7793,
"eval_samples_per_second": 107.783,
"eval_steps_per_second": 3.849,
"step": 5238
},
{
"epoch": 2.1000381825124093,
"grad_norm": 2.953125,
"learning_rate": 0.003831996945399007,
"loss": 6.8913,
"step": 5500
},
{
"epoch": 2.290950744558992,
"grad_norm": 8.75,
"learning_rate": 0.003816723940435281,
"loss": 6.8984,
"step": 6000
},
{
"epoch": 2.4818633066055744,
"grad_norm": 5.96875,
"learning_rate": 0.003801450935471554,
"loss": 6.8961,
"step": 6500
},
{
"epoch": 2.6727758686521574,
"grad_norm": 4.375,
"learning_rate": 0.003786177930507828,
"loss": 6.8904,
"step": 7000
},
{
"epoch": 2.86368843069874,
"grad_norm": 4.15625,
"learning_rate": 0.003770904925544101,
"loss": 6.8954,
"step": 7500
},
{
"epoch": 3.0,
"eval_loss": 6.886116981506348,
"eval_runtime": 0.7695,
"eval_samples_per_second": 109.165,
"eval_steps_per_second": 3.899,
"step": 7857
},
{
"epoch": 3.0546009927453226,
"grad_norm": 3.296875,
"learning_rate": 0.0037556319205803742,
"loss": 6.891,
"step": 8000
},
{
"epoch": 3.245513554791905,
"grad_norm": 2.9375,
"learning_rate": 0.0037403589156166477,
"loss": 6.8839,
"step": 8500
},
{
"epoch": 3.436426116838488,
"grad_norm": 3.0625,
"learning_rate": 0.003725085910652921,
"loss": 6.8833,
"step": 9000
},
{
"epoch": 3.6273386788850708,
"grad_norm": 8.5,
"learning_rate": 0.0037098129056891945,
"loss": 6.8767,
"step": 9500
},
{
"epoch": 3.8182512409316534,
"grad_norm": 3.484375,
"learning_rate": 0.003694539900725468,
"loss": 6.8793,
"step": 10000
},
{
"epoch": 4.0,
"eval_loss": 6.874268531799316,
"eval_runtime": 0.7764,
"eval_samples_per_second": 108.196,
"eval_steps_per_second": 3.864,
"step": 10476
},
{
"epoch": 4.009163802978236,
"grad_norm": 8.875,
"learning_rate": 0.003679266895761741,
"loss": 6.8786,
"step": 10500
},
{
"epoch": 4.2000763650248185,
"grad_norm": 3.71875,
"learning_rate": 0.0036639938907980147,
"loss": 6.874,
"step": 11000
},
{
"epoch": 4.390988927071401,
"grad_norm": 5.5,
"learning_rate": 0.003648720885834288,
"loss": 6.8756,
"step": 11500
},
{
"epoch": 4.581901489117984,
"grad_norm": 19.5,
"learning_rate": 0.0036334478808705615,
"loss": 6.8751,
"step": 12000
},
{
"epoch": 4.772814051164566,
"grad_norm": 5.84375,
"learning_rate": 0.003618174875906835,
"loss": 6.877,
"step": 12500
},
{
"epoch": 4.963726613211149,
"grad_norm": 3.46875,
"learning_rate": 0.003602901870943108,
"loss": 6.8734,
"step": 13000
},
{
"epoch": 5.0,
"eval_loss": 6.8629889488220215,
"eval_runtime": 0.767,
"eval_samples_per_second": 109.512,
"eval_steps_per_second": 3.911,
"step": 13095
},
{
"epoch": 5.154639175257732,
"grad_norm": 5.40625,
"learning_rate": 0.0035876288659793818,
"loss": 6.8751,
"step": 13500
},
{
"epoch": 5.345551737304315,
"grad_norm": 4.78125,
"learning_rate": 0.0035723558610156548,
"loss": 6.8771,
"step": 14000
},
{
"epoch": 5.5364642993508975,
"grad_norm": 4.8125,
"learning_rate": 0.003557082856051928,
"loss": 6.8746,
"step": 14500
},
{
"epoch": 5.72737686139748,
"grad_norm": 5.875,
"learning_rate": 0.0035418098510882016,
"loss": 6.8753,
"step": 15000
},
{
"epoch": 5.918289423444063,
"grad_norm": 5.3125,
"learning_rate": 0.003526536846124475,
"loss": 6.8791,
"step": 15500
},
{
"epoch": 6.0,
"eval_loss": 6.864416122436523,
"eval_runtime": 0.7754,
"eval_samples_per_second": 108.328,
"eval_steps_per_second": 3.869,
"step": 15714
},
{
"epoch": 6.109201985490645,
"grad_norm": 5.6875,
"learning_rate": 0.0035112638411607484,
"loss": 6.8699,
"step": 16000
},
{
"epoch": 6.300114547537228,
"grad_norm": 4.375,
"learning_rate": 0.003495990836197022,
"loss": 6.8667,
"step": 16500
},
{
"epoch": 6.49102710958381,
"grad_norm": 11.5,
"learning_rate": 0.0034807178312332952,
"loss": 6.8655,
"step": 17000
},
{
"epoch": 6.681939671630393,
"grad_norm": 6.125,
"learning_rate": 0.0034654448262695686,
"loss": 6.8647,
"step": 17500
},
{
"epoch": 6.872852233676976,
"grad_norm": 5.9375,
"learning_rate": 0.003450171821305842,
"loss": 6.8588,
"step": 18000
},
{
"epoch": 7.0,
"eval_loss": 6.85207986831665,
"eval_runtime": 0.768,
"eval_samples_per_second": 109.374,
"eval_steps_per_second": 3.906,
"step": 18333
}
],
"logging_steps": 500,
"max_steps": 130950,
"num_input_tokens_seen": 0,
"num_train_epochs": 50,
"save_steps": 500,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.6896228473744097e+18,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}
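For reference, a minimal sketch of how this trainer state could be inspected with standard Python. It assumes the file is saved locally as trainer_state.json (the path and variable names are illustrative, not part of the checkpoint); it only reads keys that appear in the JSON above.

import json

# Load the Trainer state written alongside the checkpoint
# (assumed local path; adjust to wherever the file lives).
with open("trainer_state.json") as f:
    state = json.load(f)

# Split the combined log_history into step-wise training logs
# and per-epoch evaluation records.
train_records = [r for r in state["log_history"] if "loss" in r]
eval_records = [r for r in state["log_history"] if "eval_loss" in r]

print("best_metric (eval_loss):", state["best_metric"])
print("best checkpoint:", state["best_model_checkpoint"])

for r in eval_records:
    print(f"epoch {r['epoch']:>4}: eval_loss={r['eval_loss']:.4f} (step {r['step']})")

# Early stopping was configured with patience=3 and threshold=0.0,
# i.e. training stops only after 3 consecutive evaluations with no
# improvement in eval_loss.
es_args = state["stateful_callbacks"]["EarlyStoppingCallback"]["args"]
print("early stopping patience:", es_args["early_stopping_patience"])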