{
  "best_metric": 0.0,
  "best_model_checkpoint": "hiera_model/checkpoint-283",
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 283,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0353356890459364,
      "grad_norm": 4.513326644897461,
      "learning_rate": 5.882352941176471e-06,
      "loss": 7.6619,
      "step": 10
    },
    {
      "epoch": 0.0706713780918728,
      "grad_norm": 5.347131729125977,
      "learning_rate": 1.1764705882352942e-05,
      "loss": 7.6612,
      "step": 20
    },
    {
      "epoch": 0.10600706713780919,
      "grad_norm": 7.650573253631592,
      "learning_rate": 1.7647058823529414e-05,
      "loss": 7.6148,
      "step": 30
    },
    {
      "epoch": 0.1413427561837456,
      "grad_norm": 6.892947196960449,
      "learning_rate": 2.3529411764705884e-05,
      "loss": 7.532,
      "step": 40
    },
    {
      "epoch": 0.17667844522968199,
      "grad_norm": 7.388574600219727,
      "learning_rate": 2.9411764705882354e-05,
      "loss": 7.4152,
      "step": 50
    },
    {
      "epoch": 0.21201413427561838,
      "grad_norm": 8.62385368347168,
      "learning_rate": 3.529411764705883e-05,
      "loss": 7.4114,
      "step": 60
    },
    {
      "epoch": 0.24734982332155478,
      "grad_norm": 10.373017311096191,
      "learning_rate": 4.11764705882353e-05,
      "loss": 7.3113,
      "step": 70
    },
    {
      "epoch": 0.2826855123674912,
      "grad_norm": 6.3279500007629395,
      "learning_rate": 4.705882352941177e-05,
      "loss": 7.2527,
      "step": 80
    },
    {
      "epoch": 0.31802120141342755,
      "grad_norm": 5.869943618774414,
      "learning_rate": 4.967277486910995e-05,
      "loss": 7.2645,
      "step": 90
    },
    {
      "epoch": 0.35335689045936397,
      "grad_norm": 5.0281453132629395,
      "learning_rate": 4.9018324607329844e-05,
      "loss": 7.1714,
      "step": 100
    },
    {
      "epoch": 0.38869257950530034,
      "grad_norm": 7.027573585510254,
      "learning_rate": 4.836387434554974e-05,
      "loss": 7.1634,
      "step": 110
    },
    {
      "epoch": 0.42402826855123676,
      "grad_norm": 4.754793167114258,
      "learning_rate": 4.770942408376964e-05,
      "loss": 7.1112,
      "step": 120
    },
    {
      "epoch": 0.45936395759717313,
      "grad_norm": 6.395844459533691,
      "learning_rate": 4.7054973821989526e-05,
      "loss": 7.0364,
      "step": 130
    },
    {
      "epoch": 0.49469964664310956,
      "grad_norm": 5.85114049911499,
      "learning_rate": 4.6400523560209424e-05,
      "loss": 7.0286,
      "step": 140
    },
    {
      "epoch": 0.5300353356890459,
      "grad_norm": 5.452080249786377,
      "learning_rate": 4.574607329842932e-05,
      "loss": 7.1043,
      "step": 150
    },
    {
      "epoch": 0.5653710247349824,
      "grad_norm": 5.287403583526611,
      "learning_rate": 4.5091623036649215e-05,
      "loss": 6.9891,
      "step": 160
    },
    {
      "epoch": 0.6007067137809188,
      "grad_norm": 5.20114803314209,
      "learning_rate": 4.4437172774869113e-05,
      "loss": 6.9994,
      "step": 170
    },
    {
      "epoch": 0.6360424028268551,
      "grad_norm": 5.050561428070068,
      "learning_rate": 4.3782722513089005e-05,
      "loss": 6.9812,
      "step": 180
    },
    {
      "epoch": 0.6713780918727915,
      "grad_norm": 4.857853412628174,
      "learning_rate": 4.3128272251308904e-05,
      "loss": 7.1296,
      "step": 190
    },
    {
      "epoch": 0.7067137809187279,
      "grad_norm": 4.78601598739624,
      "learning_rate": 4.24738219895288e-05,
      "loss": 7.0627,
      "step": 200
    },
    {
      "epoch": 0.7420494699646644,
      "grad_norm": 7.134556293487549,
      "learning_rate": 4.181937172774869e-05,
      "loss": 7.0056,
      "step": 210
    },
    {
      "epoch": 0.7773851590106007,
      "grad_norm": 5.701265811920166,
      "learning_rate": 4.1164921465968586e-05,
      "loss": 6.9878,
      "step": 220
    },
    {
      "epoch": 0.8127208480565371,
      "grad_norm": 5.249512672424316,
      "learning_rate": 4.0510471204188485e-05,
      "loss": 6.9481,
      "step": 230
    },
    {
      "epoch": 0.8480565371024735,
      "grad_norm": 5.31380558013916,
      "learning_rate": 3.985602094240838e-05,
      "loss": 6.9556,
      "step": 240
    },
    {
      "epoch": 0.8833922261484098,
      "grad_norm": 5.351413726806641,
      "learning_rate": 3.9201570680628275e-05,
      "loss": 6.9064,
      "step": 250
    },
    {
      "epoch": 0.9187279151943463,
      "grad_norm": 5.594610214233398,
      "learning_rate": 3.8547120418848174e-05,
      "loss": 6.934,
      "step": 260
    },
    {
      "epoch": 0.9540636042402827,
      "grad_norm": 8.065481185913086,
      "learning_rate": 3.7892670157068066e-05,
      "loss": 6.8646,
      "step": 270
    },
    {
      "epoch": 0.9893992932862191,
      "grad_norm": 6.58479642868042,
      "learning_rate": 3.7238219895287964e-05,
      "loss": 6.9557,
      "step": 280
    },
    {
      "epoch": 1.0,
      "eval_accuracy": 0.0,
      "eval_loss": 8.122916221618652,
      "eval_runtime": 9.1372,
      "eval_samples_per_second": 10.944,
      "eval_steps_per_second": 0.219,
      "step": 283
    }
  ],
  "logging_steps": 10,
  "max_steps": 849,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "total_flos": 4.651092260493558e+17,
  "train_batch_size": 64,
  "trial_name": null,
  "trial_params": null
}