|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.12734195771253845, |
|
"eval_steps": 2, |
|
"global_step": 18720, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 3.921875, |
|
"learning_rate": 4.896848787123102e-07, |
|
"loss": 1.9119, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 3.5625, |
|
"learning_rate": 9.793697574246203e-07, |
|
"loss": 1.9115, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 5.96875, |
|
"learning_rate": 1.4690546361369305e-06, |
|
"loss": 1.9003, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 3.359375, |
|
"learning_rate": 1.9587395148492407e-06, |
|
"loss": 1.9034, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 3.390625, |
|
"learning_rate": 2.4484243935615507e-06, |
|
"loss": 1.9145, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 3.5, |
|
"learning_rate": 2.938109272273861e-06, |
|
"loss": 1.8969, |
|
"step": 864 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 3.234375, |
|
"learning_rate": 3.427794150986171e-06, |
|
"loss": 1.8644, |
|
"step": 1008 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 3.328125, |
|
"learning_rate": 3.917479029698481e-06, |
|
"loss": 1.8823, |
|
"step": 1152 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 3.421875, |
|
"learning_rate": 4.4071639084107905e-06, |
|
"loss": 1.8635, |
|
"step": 1296 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 3.15625, |
|
"learning_rate": 4.896848787123101e-06, |
|
"loss": 1.8545, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 3.890625, |
|
"learning_rate": 5.386533665835411e-06, |
|
"loss": 1.871, |
|
"step": 1584 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 3.046875, |
|
"learning_rate": 5.876218544547722e-06, |
|
"loss": 1.8609, |
|
"step": 1728 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 3.40625, |
|
"learning_rate": 6.365903423260031e-06, |
|
"loss": 1.8788, |
|
"step": 1872 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 3.140625, |
|
"learning_rate": 6.855588301972342e-06, |
|
"loss": 1.8672, |
|
"step": 2016 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 3.03125, |
|
"learning_rate": 7.345273180684652e-06, |
|
"loss": 1.8723, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 3.078125, |
|
"learning_rate": 7.834958059396963e-06, |
|
"loss": 1.879, |
|
"step": 2304 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 3.359375, |
|
"learning_rate": 8.324642938109274e-06, |
|
"loss": 1.8622, |
|
"step": 2448 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 3.03125, |
|
"learning_rate": 8.814327816821581e-06, |
|
"loss": 1.8533, |
|
"step": 2592 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 3.125, |
|
"learning_rate": 9.304012695533892e-06, |
|
"loss": 1.8503, |
|
"step": 2736 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 3.1875, |
|
"learning_rate": 9.793697574246203e-06, |
|
"loss": 1.8314, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 2.96875, |
|
"learning_rate": 1.0283382452958513e-05, |
|
"loss": 1.835, |
|
"step": 3024 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 3.546875, |
|
"learning_rate": 1.0773067331670823e-05, |
|
"loss": 1.8526, |
|
"step": 3168 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 2.96875, |
|
"learning_rate": 1.1262752210383133e-05, |
|
"loss": 1.8434, |
|
"step": 3312 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 3.03125, |
|
"learning_rate": 1.1752437089095444e-05, |
|
"loss": 1.8381, |
|
"step": 3456 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 2.984375, |
|
"learning_rate": 1.2242121967807753e-05, |
|
"loss": 1.848, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 1.2731806846520062e-05, |
|
"loss": 1.8476, |
|
"step": 3744 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 2.984375, |
|
"learning_rate": 1.3221491725232373e-05, |
|
"loss": 1.8307, |
|
"step": 3888 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 1.3711176603944684e-05, |
|
"loss": 1.8328, |
|
"step": 4032 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 2.875, |
|
"learning_rate": 1.4200861482656995e-05, |
|
"loss": 1.833, |
|
"step": 4176 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 2.765625, |
|
"learning_rate": 1.4690546361369304e-05, |
|
"loss": 1.8559, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 2.765625, |
|
"learning_rate": 1.4999994886948122e-05, |
|
"loss": 1.8258, |
|
"step": 4464 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 2.765625, |
|
"learning_rate": 1.499992935844023e-05, |
|
"loss": 1.8256, |
|
"step": 4608 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 3.15625, |
|
"learning_rate": 1.4999788341738499e-05, |
|
"loss": 1.8315, |
|
"step": 4752 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 1.4999571838262296e-05, |
|
"loss": 1.8583, |
|
"step": 4896 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 1.4999279850190762e-05, |
|
"loss": 1.8287, |
|
"step": 5040 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 1.4998912380462815e-05, |
|
"loss": 1.8134, |
|
"step": 5184 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 1.4998469432777108e-05, |
|
"loss": 1.8111, |
|
"step": 5328 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 3.015625, |
|
"learning_rate": 1.4997951011591987e-05, |
|
"loss": 1.8199, |
|
"step": 5472 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 1.4997357122125465e-05, |
|
"loss": 1.8188, |
|
"step": 5616 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 2.625, |
|
"learning_rate": 1.4996687770355145e-05, |
|
"loss": 1.8023, |
|
"step": 5760 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 3.015625, |
|
"learning_rate": 1.4995942963018183e-05, |
|
"loss": 1.8131, |
|
"step": 5904 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 1.49951227076112e-05, |
|
"loss": 1.8056, |
|
"step": 6048 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 1.4994227012390224e-05, |
|
"loss": 1.8034, |
|
"step": 6192 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 1.4993255886370593e-05, |
|
"loss": 1.8221, |
|
"step": 6336 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 3.0, |
|
"learning_rate": 1.4992209339326872e-05, |
|
"loss": 1.7937, |
|
"step": 6480 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 1.4991087381792756e-05, |
|
"loss": 1.7936, |
|
"step": 6624 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 1.4989890025060953e-05, |
|
"loss": 1.804, |
|
"step": 6768 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 1.4988617281183087e-05, |
|
"loss": 1.779, |
|
"step": 6912 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 1.4987269162969559e-05, |
|
"loss": 1.7907, |
|
"step": 7056 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 1.4985845683989435e-05, |
|
"loss": 1.7945, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 1.4984346858570297e-05, |
|
"loss": 1.7887, |
|
"step": 7344 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 2.9375, |
|
"learning_rate": 1.4982772701798105e-05, |
|
"loss": 1.7657, |
|
"step": 7488 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 1.4981123229517039e-05, |
|
"loss": 1.7914, |
|
"step": 7632 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 1.4979398458329347e-05, |
|
"loss": 1.7716, |
|
"step": 7776 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 1.4977598405595177e-05, |
|
"loss": 1.7979, |
|
"step": 7920 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 1.4975723089432394e-05, |
|
"loss": 1.7751, |
|
"step": 8064 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 1.4973772528716404e-05, |
|
"loss": 1.7868, |
|
"step": 8208 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 1.4971746743079972e-05, |
|
"loss": 1.7974, |
|
"step": 8352 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 1.4969645752913006e-05, |
|
"loss": 1.7895, |
|
"step": 8496 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 1.4967469579362364e-05, |
|
"loss": 1.7919, |
|
"step": 8640 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 1.4965218244331647e-05, |
|
"loss": 1.7856, |
|
"step": 8784 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 1.4962891770480958e-05, |
|
"loss": 1.7777, |
|
"step": 8928 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 2.5, |
|
"learning_rate": 1.4960490181226693e-05, |
|
"loss": 1.7796, |
|
"step": 9072 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 1.49580135007413e-05, |
|
"loss": 1.7769, |
|
"step": 9216 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 1.495546175395303e-05, |
|
"loss": 1.7843, |
|
"step": 9360 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 1.495283496654569e-05, |
|
"loss": 1.7763, |
|
"step": 9504 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 2.765625, |
|
"learning_rate": 1.4950133164958392e-05, |
|
"loss": 1.7709, |
|
"step": 9648 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 1.4947356376385275e-05, |
|
"loss": 1.781, |
|
"step": 9792 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 1.4944504628775233e-05, |
|
"loss": 1.7831, |
|
"step": 9936 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 1.4941577950831642e-05, |
|
"loss": 1.7835, |
|
"step": 10080 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 1.4938576372012062e-05, |
|
"loss": 1.7648, |
|
"step": 10224 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 1.4935499922527949e-05, |
|
"loss": 1.774, |
|
"step": 10368 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 1.4932348633344338e-05, |
|
"loss": 1.7989, |
|
"step": 10512 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 2.5, |
|
"learning_rate": 1.492912253617955e-05, |
|
"loss": 1.7624, |
|
"step": 10656 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 1.4925821663504856e-05, |
|
"loss": 1.7584, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 1.4922446048544157e-05, |
|
"loss": 1.7719, |
|
"step": 10944 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 1.4918995725273649e-05, |
|
"loss": 1.7416, |
|
"step": 11088 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 1.4915470728421481e-05, |
|
"loss": 1.7587, |
|
"step": 11232 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 1.4911871093467414e-05, |
|
"loss": 1.7562, |
|
"step": 11376 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 2.90625, |
|
"learning_rate": 1.4908196856642441e-05, |
|
"loss": 1.7579, |
|
"step": 11520 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 2.625, |
|
"learning_rate": 1.4904448054928444e-05, |
|
"loss": 1.7734, |
|
"step": 11664 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 1.4900624726057816e-05, |
|
"loss": 1.7781, |
|
"step": 11808 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 1.489672690851308e-05, |
|
"loss": 1.7576, |
|
"step": 11952 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 1.4892754641526498e-05, |
|
"loss": 1.7561, |
|
"step": 12096 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 2.375, |
|
"learning_rate": 1.4888707965079682e-05, |
|
"loss": 1.7602, |
|
"step": 12240 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 1.4884586919903186e-05, |
|
"loss": 1.7676, |
|
"step": 12384 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 1.4880391547476106e-05, |
|
"loss": 1.7546, |
|
"step": 12528 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 1.4876121890025647e-05, |
|
"loss": 1.7484, |
|
"step": 12672 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 3.765625, |
|
"learning_rate": 1.4871777990526713e-05, |
|
"loss": 1.7425, |
|
"step": 12816 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 1.4867359892701466e-05, |
|
"loss": 1.7495, |
|
"step": 12960 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 1.4862867641018887e-05, |
|
"loss": 1.7478, |
|
"step": 13104 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 1.4858301280694332e-05, |
|
"loss": 1.7648, |
|
"step": 13248 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 1.4853660857689073e-05, |
|
"loss": 1.7779, |
|
"step": 13392 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 6.0, |
|
"learning_rate": 1.4848946418709838e-05, |
|
"loss": 1.746, |
|
"step": 13536 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 11.1875, |
|
"learning_rate": 1.4844158011208336e-05, |
|
"loss": 1.7482, |
|
"step": 13680 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 1.4839295683380785e-05, |
|
"loss": 1.7614, |
|
"step": 13824 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 2.625, |
|
"learning_rate": 1.4834359484167429e-05, |
|
"loss": 1.7394, |
|
"step": 13968 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 1.4829349463252034e-05, |
|
"loss": 1.7529, |
|
"step": 14112 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 1.48242656710614e-05, |
|
"loss": 1.741, |
|
"step": 14256 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 1.4819108158764847e-05, |
|
"loss": 1.7496, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 1.4813876978273708e-05, |
|
"loss": 1.7537, |
|
"step": 14544 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 1.4808572182240786e-05, |
|
"loss": 1.7435, |
|
"step": 14688 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 1.4803193824059852e-05, |
|
"loss": 1.7665, |
|
"step": 14832 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 1.4797741957865087e-05, |
|
"loss": 1.7547, |
|
"step": 14976 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 1.4792216638530545e-05, |
|
"loss": 1.7356, |
|
"step": 15120 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 1.4786617921669603e-05, |
|
"loss": 1.7397, |
|
"step": 15264 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 1.478094586363439e-05, |
|
"loss": 1.7239, |
|
"step": 15408 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 2.96875, |
|
"learning_rate": 1.4775200521515237e-05, |
|
"loss": 1.7428, |
|
"step": 15552 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 2.765625, |
|
"learning_rate": 1.4769381953140084e-05, |
|
"loss": 1.745, |
|
"step": 15696 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 1.4763490217073917e-05, |
|
"loss": 1.7382, |
|
"step": 15840 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 2.5, |
|
"learning_rate": 1.475752537261816e-05, |
|
"loss": 1.7524, |
|
"step": 15984 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 2.5, |
|
"learning_rate": 1.475148747981009e-05, |
|
"loss": 1.7288, |
|
"step": 16128 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 1.4745376599422232e-05, |
|
"loss": 1.7336, |
|
"step": 16272 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 1.4739192792961736e-05, |
|
"loss": 1.7313, |
|
"step": 16416 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 1.4732936122669777e-05, |
|
"loss": 1.7537, |
|
"step": 16560 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 1.4726606651520911e-05, |
|
"loss": 1.7428, |
|
"step": 16704 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 2.90625, |
|
"learning_rate": 1.4720204443222452e-05, |
|
"loss": 1.7299, |
|
"step": 16848 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 1.4713729562213825e-05, |
|
"loss": 1.7276, |
|
"step": 16992 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 2.5, |
|
"learning_rate": 1.4707182073665921e-05, |
|
"loss": 1.7312, |
|
"step": 17136 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 1.470056204348044e-05, |
|
"loss": 1.7477, |
|
"step": 17280 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 1.4693869538289221e-05, |
|
"loss": 1.7296, |
|
"step": 17424 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 2.890625, |
|
"learning_rate": 1.4687104625453587e-05, |
|
"loss": 1.7353, |
|
"step": 17568 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 1.4680267373063651e-05, |
|
"loss": 1.7265, |
|
"step": 17712 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 1.467335784993764e-05, |
|
"loss": 1.7406, |
|
"step": 17856 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 1.4666376125621198e-05, |
|
"loss": 1.7313, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 1.4659322270386687e-05, |
|
"loss": 1.7614, |
|
"step": 18144 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 1.4652196355232481e-05, |
|
"loss": 1.7313, |
|
"step": 18288 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 2.5, |
|
"learning_rate": 1.4644998451882254e-05, |
|
"loss": 1.7307, |
|
"step": 18432 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 1.463772863278425e-05, |
|
"loss": 1.7291, |
|
"step": 18576 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 1.4630386971110556e-05, |
|
"loss": 1.7315, |
|
"step": 18720 |
|
} |
|
], |
|
"logging_steps": 144, |
|
"max_steps": 147005, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 1440, |
|
"total_flos": 7.842747163103724e+19, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|