|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 9.963369963369964, |
|
"eval_steps": 500, |
|
"global_step": 680, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.14652014652014653, |
|
"grad_norm": 0.301513671875, |
|
"learning_rate": 0.00019989329748023725, |
|
"loss": 0.8924, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.29304029304029305, |
|
"grad_norm": 0.32275390625, |
|
"learning_rate": 0.00019957341762950344, |
|
"loss": 0.8199, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.43956043956043955, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 0.0001990410430875205, |
|
"loss": 0.8061, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.5860805860805861, |
|
"grad_norm": 0.445068359375, |
|
"learning_rate": 0.0001982973099683902, |
|
"loss": 0.8483, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.7326007326007326, |
|
"grad_norm": 0.396728515625, |
|
"learning_rate": 0.0001973438054360693, |
|
"loss": 0.8471, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.8791208791208791, |
|
"grad_norm": 0.35693359375, |
|
"learning_rate": 0.00019618256431728194, |
|
"loss": 0.8433, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.0256410256410255, |
|
"grad_norm": 0.360595703125, |
|
"learning_rate": 0.0001948160647590966, |
|
"loss": 0.8381, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.1721611721611722, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 0.00019324722294043558, |
|
"loss": 0.7955, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.3186813186813187, |
|
"grad_norm": 0.388671875, |
|
"learning_rate": 0.0001914793868488021, |
|
"loss": 0.8182, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.4652014652014653, |
|
"grad_norm": 0.45458984375, |
|
"learning_rate": 0.00018951632913550626, |
|
"loss": 0.7914, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.6117216117216118, |
|
"grad_norm": 0.51025390625, |
|
"learning_rate": 0.00018736223906463696, |
|
"loss": 0.8114, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.7582417582417582, |
|
"grad_norm": 0.52001953125, |
|
"learning_rate": 0.00018502171357296144, |
|
"loss": 0.8434, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.9047619047619047, |
|
"grad_norm": 0.433349609375, |
|
"learning_rate": 0.00018249974745983023, |
|
"loss": 0.7869, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 2.051282051282051, |
|
"grad_norm": 0.62255859375, |
|
"learning_rate": 0.000179801722728024, |
|
"loss": 0.7767, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 2.197802197802198, |
|
"grad_norm": 0.469970703125, |
|
"learning_rate": 0.00017693339709828792, |
|
"loss": 0.7711, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 2.3443223443223444, |
|
"grad_norm": 0.447998046875, |
|
"learning_rate": 0.00017390089172206592, |
|
"loss": 0.7823, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 2.490842490842491, |
|
"grad_norm": 0.41015625, |
|
"learning_rate": 0.00017071067811865476, |
|
"loss": 0.7747, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 2.6373626373626373, |
|
"grad_norm": 0.466796875, |
|
"learning_rate": 0.00016736956436465573, |
|
"loss": 0.7877, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 2.7838827838827838, |
|
"grad_norm": 0.48046875, |
|
"learning_rate": 0.00016388468056519612, |
|
"loss": 0.7778, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 2.9304029304029307, |
|
"grad_norm": 0.47998046875, |
|
"learning_rate": 0.00016026346363792567, |
|
"loss": 0.754, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 3.076923076923077, |
|
"grad_norm": 0.50244140625, |
|
"learning_rate": 0.0001565136414422592, |
|
"loss": 0.8089, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 3.2234432234432235, |
|
"grad_norm": 0.475830078125, |
|
"learning_rate": 0.0001526432162877356, |
|
"loss": 0.75, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 3.36996336996337, |
|
"grad_norm": 0.464111328125, |
|
"learning_rate": 0.00014866044785668563, |
|
"loss": 0.762, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 3.5164835164835164, |
|
"grad_norm": 0.61474609375, |
|
"learning_rate": 0.00014457383557765386, |
|
"loss": 0.7423, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 3.663003663003663, |
|
"grad_norm": 0.4873046875, |
|
"learning_rate": 0.00014039210048718949, |
|
"loss": 0.764, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 3.8095238095238093, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 0.00013612416661871533, |
|
"loss": 0.7422, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 3.956043956043956, |
|
"grad_norm": 0.473388671875, |
|
"learning_rate": 0.00013177914195819016, |
|
"loss": 0.7702, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 4.102564102564102, |
|
"grad_norm": 0.476318359375, |
|
"learning_rate": 0.0001273662990072083, |
|
"loss": 0.7237, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 4.249084249084249, |
|
"grad_norm": 0.471923828125, |
|
"learning_rate": 0.0001228950549950134, |
|
"loss": 0.7334, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 4.395604395604396, |
|
"grad_norm": 0.50537109375, |
|
"learning_rate": 0.00011837495178165706, |
|
"loss": 0.7274, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 4.542124542124542, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.00011381563549518823, |
|
"loss": 0.7142, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 4.688644688644689, |
|
"grad_norm": 0.48779296875, |
|
"learning_rate": 0.00010922683594633021, |
|
"loss": 0.7166, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 4.835164835164835, |
|
"grad_norm": 0.513671875, |
|
"learning_rate": 0.00010461834586457398, |
|
"loss": 0.76, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 4.981684981684982, |
|
"grad_norm": 0.5048828125, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7571, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 5.128205128205128, |
|
"grad_norm": 0.5166015625, |
|
"learning_rate": 9.538165413542607e-05, |
|
"loss": 0.7046, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 5.274725274725275, |
|
"grad_norm": 0.51416015625, |
|
"learning_rate": 9.077316405366981e-05, |
|
"loss": 0.6916, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 5.4212454212454215, |
|
"grad_norm": 0.61279296875, |
|
"learning_rate": 8.61843645048118e-05, |
|
"loss": 0.7281, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 5.5677655677655675, |
|
"grad_norm": 0.54736328125, |
|
"learning_rate": 8.162504821834295e-05, |
|
"loss": 0.6963, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 5.714285714285714, |
|
"grad_norm": 0.6103515625, |
|
"learning_rate": 7.710494500498662e-05, |
|
"loss": 0.7323, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 5.860805860805861, |
|
"grad_norm": 0.60595703125, |
|
"learning_rate": 7.263370099279172e-05, |
|
"loss": 0.7342, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 6.007326007326007, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 6.822085804180984e-05, |
|
"loss": 0.7018, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 6.153846153846154, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 6.387583338128471e-05, |
|
"loss": 0.6998, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 6.3003663003663, |
|
"grad_norm": 0.591796875, |
|
"learning_rate": 5.960789951281052e-05, |
|
"loss": 0.6639, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 6.446886446886447, |
|
"grad_norm": 0.60107421875, |
|
"learning_rate": 5.542616442234618e-05, |
|
"loss": 0.7004, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 6.593406593406593, |
|
"grad_norm": 0.55810546875, |
|
"learning_rate": 5.1339552143314384e-05, |
|
"loss": 0.7155, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 6.73992673992674, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 4.735678371226441e-05, |
|
"loss": 0.7091, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 6.886446886446887, |
|
"grad_norm": 0.53662109375, |
|
"learning_rate": 4.3486358557740814e-05, |
|
"loss": 0.6888, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 7.032967032967033, |
|
"grad_norm": 0.5029296875, |
|
"learning_rate": 3.973653636207437e-05, |
|
"loss": 0.6888, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 7.17948717948718, |
|
"grad_norm": 0.53662109375, |
|
"learning_rate": 3.6115319434803894e-05, |
|
"loss": 0.6845, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 7.326007326007326, |
|
"grad_norm": 0.59130859375, |
|
"learning_rate": 3.263043563534428e-05, |
|
"loss": 0.6731, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 7.472527472527473, |
|
"grad_norm": 0.564453125, |
|
"learning_rate": 2.9289321881345254e-05, |
|
"loss": 0.6708, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 7.619047619047619, |
|
"grad_norm": 0.62158203125, |
|
"learning_rate": 2.6099108277934103e-05, |
|
"loss": 0.6906, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 7.7655677655677655, |
|
"grad_norm": 0.572265625, |
|
"learning_rate": 2.3066602901712108e-05, |
|
"loss": 0.6841, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 7.912087912087912, |
|
"grad_norm": 0.5771484375, |
|
"learning_rate": 2.0198277271976052e-05, |
|
"loss": 0.6664, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 8.058608058608058, |
|
"grad_norm": 0.587890625, |
|
"learning_rate": 1.750025254016978e-05, |
|
"loss": 0.6899, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 8.205128205128204, |
|
"grad_norm": 0.55810546875, |
|
"learning_rate": 1.4978286427038601e-05, |
|
"loss": 0.6723, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 8.351648351648352, |
|
"grad_norm": 0.560546875, |
|
"learning_rate": 1.2637760935363053e-05, |
|
"loss": 0.6494, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 8.498168498168498, |
|
"grad_norm": 0.568359375, |
|
"learning_rate": 1.0483670864493778e-05, |
|
"loss": 0.6958, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 8.644688644688644, |
|
"grad_norm": 0.5947265625, |
|
"learning_rate": 8.520613151197898e-06, |
|
"loss": 0.6637, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 8.791208791208792, |
|
"grad_norm": 0.5556640625, |
|
"learning_rate": 6.75277705956443e-06, |
|
"loss": 0.6876, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 8.937728937728938, |
|
"grad_norm": 0.60791015625, |
|
"learning_rate": 5.183935240903414e-06, |
|
"loss": 0.6656, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 9.084249084249084, |
|
"grad_norm": 0.51416015625, |
|
"learning_rate": 3.817435682718096e-06, |
|
"loss": 0.6786, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 9.23076923076923, |
|
"grad_norm": 0.56689453125, |
|
"learning_rate": 2.656194563930714e-06, |
|
"loss": 0.668, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 9.377289377289378, |
|
"grad_norm": 0.5615234375, |
|
"learning_rate": 1.7026900316098215e-06, |
|
"loss": 0.6681, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 9.523809523809524, |
|
"grad_norm": 0.587890625, |
|
"learning_rate": 9.589569124794916e-07, |
|
"loss": 0.6496, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 9.67032967032967, |
|
"grad_norm": 0.57373046875, |
|
"learning_rate": 4.2658237049655323e-07, |
|
"loss": 0.6891, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 9.816849816849818, |
|
"grad_norm": 0.572265625, |
|
"learning_rate": 1.0670251976275803e-07, |
|
"loss": 0.6689, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 9.963369963369964, |
|
"grad_norm": 0.57470703125, |
|
"learning_rate": 0.0, |
|
"loss": 0.6729, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 9.963369963369964, |
|
"step": 680, |
|
"total_flos": 3.318222409433088e+16, |
|
"train_loss": 0.735039118458243, |
|
"train_runtime": 593.4308, |
|
"train_samples_per_second": 4.6, |
|
"train_steps_per_second": 1.146 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 680, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 500, |
|
"total_flos": 3.318222409433088e+16, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|