|
{ |
|
"best_metric": 6.85207986831665, |
|
"best_model_checkpoint": "./results/models/mistral-dna/checkpoint-18333", |
|
"epoch": 7.0, |
|
"eval_steps": 500, |
|
"global_step": 18333, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.19091256204658266, |
|
"grad_norm": 0.0634765625, |
|
"learning_rate": 0.003984726995036274, |
|
"loss": 6.9779, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.3818251240931653, |
|
"grad_norm": 0.09716796875, |
|
"learning_rate": 0.003969453990072547, |
|
"loss": 6.8377, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.572737686139748, |
|
"grad_norm": 0.07177734375, |
|
"learning_rate": 0.003954180985108821, |
|
"loss": 6.7842, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.7636502481863306, |
|
"grad_norm": 0.1025390625, |
|
"learning_rate": 0.003938907980145094, |
|
"loss": 6.8022, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.9545628102329133, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 0.003923634975181367, |
|
"loss": 6.8364, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 6.887508392333984, |
|
"eval_runtime": 0.7694, |
|
"eval_samples_per_second": 109.176, |
|
"eval_steps_per_second": 3.899, |
|
"step": 2619 |
|
}, |
|
{ |
|
"epoch": 1.145475372279496, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 0.0039083619702176406, |
|
"loss": 6.8965, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.3363879343260787, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 0.003893088965253914, |
|
"loss": 6.9056, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 1.5273004963726613, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 0.003877815960290187, |
|
"loss": 6.8995, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.718213058419244, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 0.003862542955326461, |
|
"loss": 6.8937, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 1.9091256204658267, |
|
"grad_norm": 3.859375, |
|
"learning_rate": 0.0038472699503627338, |
|
"loss": 6.8934, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 6.88522481918335, |
|
"eval_runtime": 0.7793, |
|
"eval_samples_per_second": 107.783, |
|
"eval_steps_per_second": 3.849, |
|
"step": 5238 |
|
}, |
|
{ |
|
"epoch": 2.1000381825124093, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 0.003831996945399007, |
|
"loss": 6.8913, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 2.290950744558992, |
|
"grad_norm": 8.75, |
|
"learning_rate": 0.003816723940435281, |
|
"loss": 6.8984, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 2.4818633066055744, |
|
"grad_norm": 5.96875, |
|
"learning_rate": 0.003801450935471554, |
|
"loss": 6.8961, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 2.6727758686521574, |
|
"grad_norm": 4.375, |
|
"learning_rate": 0.003786177930507828, |
|
"loss": 6.8904, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 2.86368843069874, |
|
"grad_norm": 4.15625, |
|
"learning_rate": 0.003770904925544101, |
|
"loss": 6.8954, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 6.886116981506348, |
|
"eval_runtime": 0.7695, |
|
"eval_samples_per_second": 109.165, |
|
"eval_steps_per_second": 3.899, |
|
"step": 7857 |
|
}, |
|
{ |
|
"epoch": 3.0546009927453226, |
|
"grad_norm": 3.296875, |
|
"learning_rate": 0.0037556319205803742, |
|
"loss": 6.891, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 3.245513554791905, |
|
"grad_norm": 2.9375, |
|
"learning_rate": 0.0037403589156166477, |
|
"loss": 6.8839, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 3.436426116838488, |
|
"grad_norm": 3.0625, |
|
"learning_rate": 0.003725085910652921, |
|
"loss": 6.8833, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 3.6273386788850708, |
|
"grad_norm": 8.5, |
|
"learning_rate": 0.0037098129056891945, |
|
"loss": 6.8767, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 3.8182512409316534, |
|
"grad_norm": 3.484375, |
|
"learning_rate": 0.003694539900725468, |
|
"loss": 6.8793, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 6.874268531799316, |
|
"eval_runtime": 0.7764, |
|
"eval_samples_per_second": 108.196, |
|
"eval_steps_per_second": 3.864, |
|
"step": 10476 |
|
}, |
|
{ |
|
"epoch": 4.009163802978236, |
|
"grad_norm": 8.875, |
|
"learning_rate": 0.003679266895761741, |
|
"loss": 6.8786, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 4.2000763650248185, |
|
"grad_norm": 3.71875, |
|
"learning_rate": 0.0036639938907980147, |
|
"loss": 6.874, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 4.390988927071401, |
|
"grad_norm": 5.5, |
|
"learning_rate": 0.003648720885834288, |
|
"loss": 6.8756, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 4.581901489117984, |
|
"grad_norm": 19.5, |
|
"learning_rate": 0.0036334478808705615, |
|
"loss": 6.8751, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 4.772814051164566, |
|
"grad_norm": 5.84375, |
|
"learning_rate": 0.003618174875906835, |
|
"loss": 6.877, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 4.963726613211149, |
|
"grad_norm": 3.46875, |
|
"learning_rate": 0.003602901870943108, |
|
"loss": 6.8734, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_loss": 6.8629889488220215, |
|
"eval_runtime": 0.767, |
|
"eval_samples_per_second": 109.512, |
|
"eval_steps_per_second": 3.911, |
|
"step": 13095 |
|
}, |
|
{ |
|
"epoch": 5.154639175257732, |
|
"grad_norm": 5.40625, |
|
"learning_rate": 0.0035876288659793818, |
|
"loss": 6.8751, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 5.345551737304315, |
|
"grad_norm": 4.78125, |
|
"learning_rate": 0.0035723558610156548, |
|
"loss": 6.8771, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 5.5364642993508975, |
|
"grad_norm": 4.8125, |
|
"learning_rate": 0.003557082856051928, |
|
"loss": 6.8746, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 5.72737686139748, |
|
"grad_norm": 5.875, |
|
"learning_rate": 0.0035418098510882016, |
|
"loss": 6.8753, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 5.918289423444063, |
|
"grad_norm": 5.3125, |
|
"learning_rate": 0.003526536846124475, |
|
"loss": 6.8791, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_loss": 6.864416122436523, |
|
"eval_runtime": 0.7754, |
|
"eval_samples_per_second": 108.328, |
|
"eval_steps_per_second": 3.869, |
|
"step": 15714 |
|
}, |
|
{ |
|
"epoch": 6.109201985490645, |
|
"grad_norm": 5.6875, |
|
"learning_rate": 0.0035112638411607484, |
|
"loss": 6.8699, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 6.300114547537228, |
|
"grad_norm": 4.375, |
|
"learning_rate": 0.003495990836197022, |
|
"loss": 6.8667, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 6.49102710958381, |
|
"grad_norm": 11.5, |
|
"learning_rate": 0.0034807178312332952, |
|
"loss": 6.8655, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 6.681939671630393, |
|
"grad_norm": 6.125, |
|
"learning_rate": 0.0034654448262695686, |
|
"loss": 6.8647, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 6.872852233676976, |
|
"grad_norm": 5.9375, |
|
"learning_rate": 0.003450171821305842, |
|
"loss": 6.8588, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_loss": 6.85207986831665, |
|
"eval_runtime": 0.768, |
|
"eval_samples_per_second": 109.374, |
|
"eval_steps_per_second": 3.906, |
|
"step": 18333 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 130950, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 50, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 3, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.6896228473744097e+18, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|