|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 9.953917050691244, |
|
"eval_steps": 500, |
|
"global_step": 540, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.18433179723502305, |
|
"grad_norm": 15.2578125, |
|
"learning_rate": 0.00019983081582712685, |
|
"loss": 1.1335, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.3686635944700461, |
|
"grad_norm": 15.1484375, |
|
"learning_rate": 0.00019932383577419432, |
|
"loss": 1.0389, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.5529953917050692, |
|
"grad_norm": 16.21875, |
|
"learning_rate": 0.00019848077530122083, |
|
"loss": 1.0017, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.7373271889400922, |
|
"grad_norm": 13.3359375, |
|
"learning_rate": 0.00019730448705798239, |
|
"loss": 0.9937, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.9216589861751152, |
|
"grad_norm": 13.1171875, |
|
"learning_rate": 0.0001957989512315489, |
|
"loss": 0.9773, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.1059907834101383, |
|
"grad_norm": 11.2578125, |
|
"learning_rate": 0.00019396926207859084, |
|
"loss": 0.9484, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.2903225806451613, |
|
"grad_norm": 13.71875, |
|
"learning_rate": 0.00019182161068802741, |
|
"loss": 0.9577, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.4746543778801844, |
|
"grad_norm": 10.65625, |
|
"learning_rate": 0.00018936326403234125, |
|
"loss": 0.9277, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.6589861751152073, |
|
"grad_norm": 12.90625, |
|
"learning_rate": 0.00018660254037844388, |
|
"loss": 0.9301, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.8433179723502304, |
|
"grad_norm": 13.7578125, |
|
"learning_rate": 0.00018354878114129367, |
|
"loss": 0.9212, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 2.0276497695852536, |
|
"grad_norm": 10.140625, |
|
"learning_rate": 0.0001802123192755044, |
|
"loss": 0.8985, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 2.2119815668202767, |
|
"grad_norm": 9.6015625, |
|
"learning_rate": 0.0001766044443118978, |
|
"loss": 0.8939, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 2.3963133640552994, |
|
"grad_norm": 11.921875, |
|
"learning_rate": 0.00017273736415730488, |
|
"loss": 0.8935, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 2.5806451612903225, |
|
"grad_norm": 10.8203125, |
|
"learning_rate": 0.0001686241637868734, |
|
"loss": 0.8973, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 2.7649769585253456, |
|
"grad_norm": 11.7421875, |
|
"learning_rate": 0.00016427876096865394, |
|
"loss": 0.9522, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 2.9493087557603688, |
|
"grad_norm": 11.5234375, |
|
"learning_rate": 0.00015971585917027862, |
|
"loss": 0.9124, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 3.133640552995392, |
|
"grad_norm": 10.25, |
|
"learning_rate": 0.0001549508978070806, |
|
"loss": 0.899, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 3.3179723502304146, |
|
"grad_norm": 10.4609375, |
|
"learning_rate": 0.00015000000000000001, |
|
"loss": 0.8893, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 3.5023041474654377, |
|
"grad_norm": 9.0078125, |
|
"learning_rate": 0.00014487991802004623, |
|
"loss": 0.8878, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 3.686635944700461, |
|
"grad_norm": 10.21875, |
|
"learning_rate": 0.0001396079766039157, |
|
"loss": 0.8656, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 3.870967741935484, |
|
"grad_norm": 7.875, |
|
"learning_rate": 0.00013420201433256689, |
|
"loss": 0.8897, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 4.055299539170507, |
|
"grad_norm": 9.4921875, |
|
"learning_rate": 0.00012868032327110904, |
|
"loss": 0.8549, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 4.23963133640553, |
|
"grad_norm": 9.2421875, |
|
"learning_rate": 0.00012306158707424403, |
|
"loss": 0.8577, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 4.423963133640553, |
|
"grad_norm": 7.875, |
|
"learning_rate": 0.00011736481776669306, |
|
"loss": 0.8579, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 4.6082949308755765, |
|
"grad_norm": 8.1328125, |
|
"learning_rate": 0.00011160929141252303, |
|
"loss": 0.8278, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 4.792626728110599, |
|
"grad_norm": 7.01171875, |
|
"learning_rate": 0.00010581448289104758, |
|
"loss": 0.8458, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 4.976958525345622, |
|
"grad_norm": 6.92578125, |
|
"learning_rate": 0.0001, |
|
"loss": 0.874, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 5.161290322580645, |
|
"grad_norm": 8.1796875, |
|
"learning_rate": 9.418551710895243e-05, |
|
"loss": 0.8455, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 5.345622119815668, |
|
"grad_norm": 7.21484375, |
|
"learning_rate": 8.839070858747697e-05, |
|
"loss": 0.8316, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 5.529953917050691, |
|
"grad_norm": 9.9375, |
|
"learning_rate": 8.263518223330697e-05, |
|
"loss": 0.8555, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 5.714285714285714, |
|
"grad_norm": 7.5390625, |
|
"learning_rate": 7.693841292575598e-05, |
|
"loss": 0.8385, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 5.8986175115207375, |
|
"grad_norm": 6.8515625, |
|
"learning_rate": 7.131967672889101e-05, |
|
"loss": 0.8339, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 6.082949308755761, |
|
"grad_norm": 7.38671875, |
|
"learning_rate": 6.579798566743314e-05, |
|
"loss": 0.8497, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 6.267281105990784, |
|
"grad_norm": 7.21875, |
|
"learning_rate": 6.039202339608432e-05, |
|
"loss": 0.836, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 6.451612903225806, |
|
"grad_norm": 8.515625, |
|
"learning_rate": 5.5120081979953785e-05, |
|
"loss": 0.8411, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 6.635944700460829, |
|
"grad_norm": 8.515625, |
|
"learning_rate": 5.000000000000002e-05, |
|
"loss": 0.822, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 6.820276497695852, |
|
"grad_norm": 7.73828125, |
|
"learning_rate": 4.50491021929194e-05, |
|
"loss": 0.8298, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 7.0046082949308754, |
|
"grad_norm": 7.15625, |
|
"learning_rate": 4.028414082972141e-05, |
|
"loss": 0.8366, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 7.188940092165899, |
|
"grad_norm": 7.45703125, |
|
"learning_rate": 3.5721239031346066e-05, |
|
"loss": 0.8222, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 7.373271889400922, |
|
"grad_norm": 7.94921875, |
|
"learning_rate": 3.137583621312665e-05, |
|
"loss": 0.8294, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 7.557603686635945, |
|
"grad_norm": 7.80078125, |
|
"learning_rate": 2.7262635842695127e-05, |
|
"loss": 0.8189, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 7.741935483870968, |
|
"grad_norm": 7.33984375, |
|
"learning_rate": 2.339555568810221e-05, |
|
"loss": 0.8279, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 7.926267281105991, |
|
"grad_norm": 10.0390625, |
|
"learning_rate": 1.9787680724495617e-05, |
|
"loss": 0.8299, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 8.110599078341014, |
|
"grad_norm": 8.34375, |
|
"learning_rate": 1.6451218858706374e-05, |
|
"loss": 0.8408, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 8.294930875576037, |
|
"grad_norm": 7.22265625, |
|
"learning_rate": 1.339745962155613e-05, |
|
"loss": 0.819, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 8.47926267281106, |
|
"grad_norm": 7.890625, |
|
"learning_rate": 1.0636735967658784e-05, |
|
"loss": 0.8087, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 8.663594470046084, |
|
"grad_norm": 8.9296875, |
|
"learning_rate": 8.178389311972612e-06, |
|
"loss": 0.8218, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 8.847926267281107, |
|
"grad_norm": 9.421875, |
|
"learning_rate": 6.030737921409169e-06, |
|
"loss": 0.8213, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 9.03225806451613, |
|
"grad_norm": 7.0078125, |
|
"learning_rate": 4.20104876845111e-06, |
|
"loss": 0.8259, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 9.216589861751151, |
|
"grad_norm": 7.5546875, |
|
"learning_rate": 2.6955129420176196e-06, |
|
"loss": 0.8463, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 9.400921658986174, |
|
"grad_norm": 8.0546875, |
|
"learning_rate": 1.5192246987791981e-06, |
|
"loss": 0.7932, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 9.585253456221198, |
|
"grad_norm": 8.109375, |
|
"learning_rate": 6.761642258056978e-07, |
|
"loss": 0.8172, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 9.76958525345622, |
|
"grad_norm": 7.5859375, |
|
"learning_rate": 1.6918417287318245e-07, |
|
"loss": 0.8263, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 9.953917050691244, |
|
"grad_norm": 8.59375, |
|
"learning_rate": 0.0, |
|
"loss": 0.8372, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 9.953917050691244, |
|
"step": 540, |
|
"total_flos": 3.52542013784064e+16, |
|
"train_loss": 0.8746989762341535, |
|
"train_runtime": 468.0964, |
|
"train_samples_per_second": 4.636, |
|
"train_steps_per_second": 1.154 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 540, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 500, |
|
"total_flos": 3.52542013784064e+16, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|